In [1]:
pip install kneed
Collecting kneed
  Downloading kneed-0.8.5-py3-none-any.whl.metadata (5.5 kB)
Requirement already satisfied: numpy>=1.14.2 in /usr/local/lib/python3.12/dist-packages (from kneed) (2.0.2)
Requirement already satisfied: scipy>=1.0.0 in /usr/local/lib/python3.12/dist-packages (from kneed) (1.16.3)
Downloading kneed-0.8.5-py3-none-any.whl (10 kB)
Installing collected packages: kneed
Successfully installed kneed-0.8.5
In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')
In [3]:
# Reading in the data

df_tmb = pd.read_parquet('/content/final_summary_20251017.parquet')
df_tmb.head()
Out[3]:
cartridgeSerialNumber error_first rawTestDetail.testResultsInfo.errorCode_first warnings_first readerFirmwareVersion_first readerHardwareVersion_first readerSerialNumber_first cartType_first procedureId_first procedureName_first ... we5_step93_max we6_step93_mean we6_step93_min we6_step93_max we1_step66_tailmean we2_step66_tailmean we3_step66_tailmean we4_step66_tailmean we5_step66_tailmean we6_step66_tailmean
0 0 81 81.0 [4, 74, 72, 80] 1.5.0-beta.10-mux-leak-check 15 12102010125415 0 FB00602C-F296-432B-AF27-9F1E2542A7A9 745 MUX (666 Flu+CoV PID, modified RV flow check) ... NaN NaN NaN NaN -12.540335 0.0 0.0 0.0 0.0 0.0
1 137051097 0 0.0 [6] 1.5.0-beta.4-6-gab63b390-mux 15 22102010204918 19 FB00602C-F296-432B-AF27-9F1E2542A7A9 745 MUX (666 Flu+CoV PID, modified RV flow check) ... NaN NaN NaN NaN -53.839251 0.0 0.0 0.0 0.0 0.0
2 137051110 0 0.0 [] 1.5.0-beta.4-6-gab63b390-mux 15 12202010076543 19 FB00602C-F296-432B-AF27-9F1E2542A7A9 745 MUX (666 Flu+CoV PID, modified RV flow check) ... NaN NaN NaN NaN -27.299443 0.0 0.0 0.0 0.0 0.0
3 137051120 0 0.0 [] 1.5.0-beta.4-6-gab63b390-mux 15 22102010205235 19 FB00602C-F296-432B-AF27-9F1E2542A7A9 745 MUX (666 Flu+CoV PID, modified RV flow check) ... NaN NaN NaN NaN -84.461012 0.0 0.0 0.0 0.0 0.0
4 137051166 0 0.0 [] 1.5.0-beta.5-17-gc522bbff-mux-record-mask 15 22102010205235 19 FB00602C-F296-432B-AF27-9F1E2542A7A9 745 MUX (666 Flu+CoV PID, modified RV flow check) ... NaN NaN NaN NaN -60.751939 0.0 0.0 0.0 0.0 0.0

5 rows × 133 columns

In [4]:
# Reading in the testing sheet data and joining them together

df_siro = pd.read_csv('/content/SIRO Testing Sheet - FABC Testing.csv', header=[1])

df_archive = pd.read_csv('/content/Siro Test Data Archive - FABC Data.csv')

# Joining df_siro and df_archive

df_siro = pd.concat([df_siro, df_archive])
df_siro.shape
Out[4]:
(146332, 40)
In [5]:
# Checking for overlap between dataframes

common = set(df_siro["Cartridge SN"]) & set(df_tmb["cartridgeSerialNumber"])
print("Overlap count:", len(common))
print("df_siro count:", df_siro["Cartridge SN"].nunique())
print("df count:", df_tmb["cartridgeSerialNumber"].nunique())



# concatenating df_tmb with df_siro by cartridgeSerialNumber by bringing over selected columns

# Ensure consistent types
df_siro["Cartridge SN"] = df_siro["Cartridge SN"].astype(str).str.strip()
df_tmb["cartridgeSerialNumber"] = df_tmb["cartridgeSerialNumber"].astype(str).str.strip()
df_siro["Cartridge SN"] = df_siro["Cartridge SN"].str.replace(r"\.0$", "", regex=True)
df_tmb["cartridgeSerialNumber"] = df_tmb["cartridgeSerialNumber"].str.replace(r"\.0$", "", regex=True)


# Merge on cartridge serial number
merged_df = pd.merge(
    df_siro,
    df_tmb,
    left_on="Cartridge SN",
    right_on="cartridgeSerialNumber",
    how="inner"   # or "inner" if you only want matching rows
)

# Optional: sanity check
print("Merged shape:", merged_df.shape)
Overlap count: 40667
df_siro count: 129538
df count: 52493
Merged shape: (64888, 173)
In [6]:
merged_df['cartridgeSerialNumber'].nunique()
Out[6]:
43236
In [7]:
# making a copy of our df

merged_df_copy = merged_df.copy()
In [8]:
# Checking for NaN's

merged_df.isna().sum()
Out[8]:
0
Study Label 0
Pellet 1579
Variable A 2201
Variable B 30813
Variable C 62419
... ...
we2_step66_tailmean 0
we3_step66_tailmean 0
we4_step66_tailmean 0
we5_step66_tailmean 0
we6_step66_tailmean 0

173 rows × 1 columns


In [9]:
# dropping decimal from 'Reader SN'

# merged_df['Reader SN'] = merged_df['Reader SN'].astype(str).str.replace('.0', '', regex=False)
In [10]:
# Formatting 'Date Tested'

merged_df['Date Tested'] = pd.to_datetime(
    merged_df['Date Tested'],
    format='mixed',
    dayfirst=True,
    errors='coerce'
)
merged_df['Date Tested'] = merged_df['Date Tested'].dt.strftime('%d-%b-%Y')
In [11]:
# Formatting 'Mfg Date'

merged_df['Mfg Date'] = pd.to_datetime(
    merged_df['Mfg Date'],
    format='mixed',
    dayfirst=True,
    errors='coerce'
    )
merged_df['Mfg Date'] = merged_df['Mfg Date'].dt.strftime('%d-%b-%Y')
In [12]:
# Formatting 'Reader SN' to int

merged_df['Reader SN'] = pd.to_numeric(
    merged_df['Reader SN'],
    errors='coerce'  # invalid strings become NaN
).astype('Int64')     # nullable integer type (keeps NaN)
In [13]:
# Formatting 'rawTestDetail.testResultsInfo.errorCode_first' to int

merged_df['rawTestDetail.testResultsInfo.errorCode_first'] = pd.to_numeric(
    merged_df['rawTestDetail.testResultsInfo.errorCode_first'],
    errors='coerce'  # invalid strings become NaN
).astype('Int64')
In [14]:
# cols_to_int = ['Warnings', 'Errors', 'warnings_first']

# for col in cols_to_int:
#     merged_df[col] = pd.to_numeric(
#         merged_df[col], errors='coerce'  # invalid values become NaN
#     ).astype('Int64')  # nullable integer type
In [15]:
merged_df['Warnings'].unique()
Out[15]:
array([nan, '[6]', '[20, 93, 80]', '[20, 93]', '[83]', '[80]', '[6, 20]',
       '[20]', '[24]', '[68, 74, 72]', '[14, 93, 30]', '[72]', '[6, 83]',
       '[6, 80]', '[6, 93]', '[10, 20, 93, 80]', '[93, 30]',
       '[6, 20, 93, 80]', '[93, 80]', '[93]', '[74]', '[29]', '[95]',
       '83', '6', '[95, 80]', '[95, 6]', '[95, 96, 80]', '[95, 20]',
       '[29, 6]', '[74, 6, 80]', '[6, 20, 93]', '[10, 20, 80]',
       '[6, 93, 30]', '[95, 6, 80]', '[95, 96, 6, 20, 93, 80]',
       '[95, 96, 20, 93, 80]', '[95, 96]', '[96, 20, 93, 80]',
       '[95, 96, 20, 93]', '6, 20, 93, 80', '[13, 80]', '[88, 6, 28]',
       '[7, 74, 72]', '[6, 20, 80]', '[88]', '[7, 74, 72, 6]', '[84]',
       '[83, 6]', '80', '20, 93, 80', '6, 80', '20', '[68, 74, 80]',
       '93, 80', '[96, 83, 80]', '29, 6', '7, 74, 72', '[4, 74, 72, 6]',
       '[20, 80]', '[96, 93, 80]', '[96, 80]', '[96, 20, 93]',
       '[96, 6, 20, 93, 80]', '[96, 6, 80]', '[14, 93]',
       '[7, 74, 96, 72, 20, 93, 80]', '[7, 74, 96, 72]', '[96, 20, 80]',
       '[96, 6, 20, 93]', '[96, 20]', '[7, 74, 96, 72, 20, 93]',
       '[96, 83, 20, 93, 80]', '[7, 74, 96, 72, 20, 80]', '[6, 12]',
       '[96]', '[7, 74, 72, 80]', '[96, 6, 20]', '[96, 6]',
       '[96, 6, 20, 80]', '[96, 93]', '[7, 74, 96, 72, 80]',
       '[13, 74, 93, 80]', '[12, 74, 72]', '[6, 14, 93]',
       '[7, 74, 72, 6, 80]', '[24, 88]', '[31]', '[72, 80]', '[11]',
       '[7, 74, 96, 72, 93, 80]', '[7, 74, 95, 72]', '[31, 74]',
       '[24, 72, 6]', '[7, 74, 72, 20]'], dtype=object)
In [16]:
merged_df.columns.to_list()
Out[16]:
['Study Label',
 'Pellet',
 'Variable A',
 'Variable B',
 'Variable C',
 'Variable D',
 'Procedure',
 'Test Sample',
 'RP Type',
 'RP Cell Count',
 'RP Lot',
 'Flu A Strain',
 'Flu A Copies',
 'Flu A Lot',
 'Flu B Strain',
 'Flu B Copies',
 'Flu B Lot',
 'CoV Strain',
 'CoV Copies',
 'CoV Lot',
 'Handling Timepoint',
 'Notes',
 'Operator Initial',
 'Bag',
 'Cartridge SN',
 'Copies',
 'Lot Number',
 'Reader SN',
 'Mfg Date',
 'Date Tested',
 'WE1',
 'WE2',
 'WE3',
 'WE4',
 'WE5',
 'WE6',
 'Warnings',
 'Errors',
 'Excluded because',
 'Testing Verification',
 'cartridgeSerialNumber',
 'error_first',
 'rawTestDetail.testResultsInfo.errorCode_first',
 'warnings_first',
 'readerFirmwareVersion_first',
 'readerHardwareVersion_first',
 'readerSerialNumber_first',
 'cartType_first',
 'procedureId_first',
 'procedureName_first',
 'rawTestDetail.testResults.weValues_first',
 'rawTestDetail.testResults.assayResults_first',
 'we1_result_first',
 'we2_result_first',
 'we3_result_first',
 'we4_result_first',
 'we5_result_first',
 'we6_result_first',
 'swabDetMv_mean',
 'swabDetMv_min',
 'swabDetMv_max',
 'cartDetMv_mean',
 'cartDetMv_min',
 'cartDetMv_max',
 'envTemp_mean',
 'envTemp_min',
 'envTemp_max',
 'nrfTemp_mean',
 'nrfTemp_min',
 'nrfTemp_max',
 'cartTemp_mean',
 'cartTemp_min',
 'cartTemp_max',
 'piezoVoltageMv_mean',
 'piezoVoltageMv_min',
 'piezoVoltageMv_max',
 'currentMa_mean',
 'currentMa_min',
 'currentMa_max',
 'voltage_mean',
 'voltage_min',
 'voltage_max',
 'piezoQCurrentMa_mean',
 'piezoQCurrentMa_min',
 'piezoQCurrentMa_max',
 'piezoInductorTemp_mean',
 'piezoInductorTemp_min',
 'piezoInductorTemp_max',
 'piezoOnTimeMs_mean',
 'piezoOnTimeMs_min',
 'piezoOnTimeMs_max',
 'targetTemp_mean',
 'targetTemp_min',
 'targetTemp_max',
 'deviceState_mean',
 'deviceState_min',
 'deviceState_max',
 'percent_mean',
 'percent_min',
 'percent_max',
 'preheatState_mean',
 'preheatState_min',
 'preheatState_max',
 'bias_mean',
 'bias_min',
 'bias_max',
 'warning_count_sum',
 'piezoWarningCount_Step0',
 'piezoWarningCount_Step1',
 'piezoWarningCount_Step2',
 'piezoWarningCount_Step3',
 'piezoWarningCount_Step4',
 'total_piezoWarnings',
 'we1_step66_mean',
 'we1_step66_min',
 'we1_step66_max',
 'we2_step66_mean',
 'we2_step66_min',
 'we2_step66_max',
 'we3_step66_mean',
 'we3_step66_min',
 'we3_step66_max',
 'we4_step66_mean',
 'we4_step66_min',
 'we4_step66_max',
 'we5_step66_mean',
 'we5_step66_min',
 'we5_step66_max',
 'we6_step66_mean',
 'we6_step66_min',
 'we6_step66_max',
 'we1_step91_mean',
 'we1_step91_min',
 'we1_step91_max',
 'we2_step91_mean',
 'we2_step91_min',
 'we2_step91_max',
 'we3_step91_mean',
 'we3_step91_min',
 'we3_step91_max',
 'we4_step91_mean',
 'we4_step91_min',
 'we4_step91_max',
 'we5_step91_mean',
 'we5_step91_min',
 'we5_step91_max',
 'we6_step91_mean',
 'we6_step91_min',
 'we6_step91_max',
 'we1_step93_mean',
 'we1_step93_min',
 'we1_step93_max',
 'we2_step93_mean',
 'we2_step93_min',
 'we2_step93_max',
 'we3_step93_mean',
 'we3_step93_min',
 'we3_step93_max',
 'we4_step93_mean',
 'we4_step93_min',
 'we4_step93_max',
 'we5_step93_mean',
 'we5_step93_min',
 'we5_step93_max',
 'we6_step93_mean',
 'we6_step93_min',
 'we6_step93_max',
 'we1_step66_tailmean',
 'we2_step66_tailmean',
 'we3_step66_tailmean',
 'we4_step66_tailmean',
 'we5_step66_tailmean',
 'we6_step66_tailmean']
In [17]:
# Handling our NaN values

# Filling NaN with 0
cols_to_clean=['Flu A Copies','Flu B Copies','CoV Copies', 'Copies']
merged_df[cols_to_clean] = merged_df[cols_to_clean].fillna(0)

# Remove commas and convert to numeric, coercing errors
for col in cols_to_clean:
    # Convert to string first to handle potential non-string types before replacing
    merged_df[col] = merged_df[col].astype(str).str.replace(',', '', regex=False)
    # Convert to numeric, coercing errors to NaN
    merged_df[col] = pd.to_numeric(merged_df[col], errors='coerce')

# Fill any NaNs created by coercion with 0 before converting to int
merged_df[cols_to_clean] = merged_df[cols_to_clean].fillna(0)

# Converting to int
merged_df[cols_to_clean] = merged_df[cols_to_clean].astype(int)


# Filling NaN values in WE1, WE2, WE3, WE4, WE5, and WE6 with 0

cols_to_fill = ['WE1', 'WE2', 'WE3', 'WE4', 'WE5', 'WE6']
merged_df[cols_to_fill] = merged_df[cols_to_fill].fillna(0)


# Ensuring WE_ columns are integers

merged_df['WE1'] = merged_df['WE1'].astype(int)
merged_df['WE2'] = merged_df['WE2'].astype(int)
merged_df['WE3'] = merged_df['WE3'].astype(int)
merged_df['WE4'] = merged_df['WE4'].astype(int)
merged_df['WE5'] = merged_df['WE5'].astype(int)
merged_df['WE6'] = merged_df['WE6'].astype(int)
In [18]:
# Filtering Dataset to Mfg Date after 11-Aug-2025

# print(f'Before filtering:',merged_df.shape)
# merged_df = merged_df[merged_df['Mfg Date'] > '11-Aug-2025']

# print(f'After filtering:',merged_df.shape)
In [19]:
# Checking our "Flu A Copies", "Flu B Copies", and 'CoV Copies'

print(merged_df['Flu A Copies'].value_counts())
print(merged_df['Flu B Copies'].value_counts())
print(merged_df['CoV Copies'].value_counts())
Flu A Copies
0         52716
150        3512
600        2887
200        1950
60         1181
300         847
50          615
1000        345
100         193
440          95
220          70
20           60
2290         54
330          44
200000       40
10000        40
3000         40
400          32
15000        20
10400        20
2200         15
1320         15
5000         12
30           12
125          12
153          10
612          10
306          10
75           10
51           10
1             9
720           2
Name: count, dtype: int64
Flu B Copies
0         53773
600        5397
200        3101
60         1078
300         766
100         338
30          130
400          96
50           46
150          40
200000       40
1000         12
146          10
1200         10
219          10
120          10
36           10
215           7
1             6
5             6
636           2
Name: count, dtype: int64
CoV Copies
0        55490
150       3445
600       1891
200       1469
60        1028
50         666
100        531
300        239
15000       40
37300       40
20          23
30          12
75           5
10           3
3            3
1            3
Name: count, dtype: int64
In [20]:
# Creating target-specific dataframes

# # Group data by 'Test Sample' including 'Donor'
# df_fluA = merged_df[(merged_df['Test Sample'] == 'FluA') | (merged_df['Test Sample'] == 'Donor')]
# df_fluB = merged_df[(merged_df['Test Sample'] == 'FluB') | (merged_df['Test Sample'] == 'Donor')]
# df_cov = merged_df[(merged_df['Test Sample'] == 'CoV') | (merged_df['Test Sample'] == 'Donor')]

# Group data by 'Test Sample' excluding 'Donor'
df_fluA = merged_df[merged_df['Test Sample'] == 'FluA']
df_fluB = merged_df[merged_df['Test Sample'] == 'FluB']
df_cov = merged_df[merged_df['Test Sample'] == 'CoV']
df_ntc = merged_df[merged_df['Test Sample'] == 'NTC']


# Dropping columns from df_fluA, df_fluB, and df_cov

# FluA drops
cols_to_drop_A=['Flu B Strain',
 'Flu B Copies',
 'Flu B Lot',
 'CoV Strain',
 'CoV Copies',
 'CoV Lot'
    ]

df_fluA = df_fluA.drop(columns=cols_to_drop_A)

# FluB drops
cols_to_drop_B=['Flu A Strain',
 'Flu A Copies',
 'Flu A Lot',
 'CoV Strain',
 'CoV Copies',
 'CoV Lot'
    ]

df_fluB = df_fluB.drop(columns=cols_to_drop_B)

# CoV drops
cols_to_drop_C=['Flu B Strain',
 'Flu B Copies',
 'Flu B Lot',
 'Flu A Strain',
 'Flu A Copies',
 'Flu A Lot'
    ]

df_cov = df_cov.drop(columns=cols_to_drop_C)


# Renaming _Strain, _Copies, and _Lot in df_fluA, df_fluB, and df_cov

df_fluA.rename(columns={
    'Flu A Strain': 'Virus_Strain',
    'Flu A Copies': 'Virus_Copies',
    'Flu A Lot': 'Virus_Lot'
}, inplace=True)

df_fluB.rename(columns={
    'Flu B Strain': 'Virus_Strain',
    'Flu B Copies': 'Virus_Copies',
    'Flu B Lot': 'Virus_Lot'
}, inplace=True)

df_cov.rename(columns={
    'CoV Strain': 'Virus_Strain',
    'CoV Copies': 'Virus_Copies',
    'CoV Lot': 'Virus_Lot'
}, inplace=True)
In [21]:
# Checking our "Flu A Copies", "Flu B Copies", and 'CoV Copies'

print(f'FluA Copies:', df_fluA['Virus_Copies'].value_counts())
print(f'FluB Copies:',df_fluB['Virus_Copies'].value_counts())
print(f'CoV Copies:',df_cov['Virus_Copies'].value_counts())
FluA Copies: Virus_Copies
150      3303
600      2809
200      1910
60       1181
300       847
50        595
1000      345
0         228
100       189
440        95
220        70
20         60
330        44
10000      40
3000       40
400        32
10400      20
1320       15
2200       15
30         12
5000       12
125        12
612        10
306        10
153        10
75         10
51         10
1           9
720         2
Name: count, dtype: int64
FluB Copies: Virus_Copies
600     5108
200     2943
60      1078
300      766
0        537
100      338
30       130
400       96
50        46
1000      12
146       10
1200      10
219       10
120       10
36        10
5          6
1          6
215        3
636        2
Name: count, dtype: int64
CoV Copies: Virus_Copies
150    3182
600    1831
200    1429
60     1028
0      1023
50      666
100     527
300     223
20       23
30       12
75        5
10        3
3         3
1         3
Name: count, dtype: int64
In [22]:
# Creating the FN and FP columns for the target-specific dataframes

# fluA
# Create binary columns based on threshold logic
df_fluA["FN"] = (df_fluA["WE3"] > -3000).astype(int)

df_fluA["WE2_FP"] = (df_fluA["WE2"] <= -3000).astype(int)

df_fluA["WE5_FP"] = (df_fluA["WE5"] <= -3000).astype(int)


# FluB
df_fluB["FN"] = (df_fluB["WE2"] > -3000).astype(int)

df_fluB["WE3_FP"] = (df_fluB["WE3"] <= -3000).astype(int)

df_fluB["WE5_FP"] = (df_fluB["WE5"] <= -3000).astype(int)

# Cov
df_cov["FN"] = (df_cov["WE5"] > -3000).astype(int)

df_cov["WE3_FP"] = (df_cov["WE3"] <= -3000).astype(int)

df_cov["WE2_FP"] = (df_cov["WE2"] <= -3000).astype(int)

# NTC
df_ntc["WE2_FP"] = (df_ntc["WE2"] <= -3000).astype(int)
df_ntc['WE3_FP'] = (df_ntc['WE3'] <= -3000).astype(int)
df_ntc['WE5_FP'] = (df_ntc['WE5'] <= -3000).astype(int)
In [23]:
merged_df['we1_step66_tailmean'].unique()
Out[23]:
array([-29.91069133, -39.97587943, -31.55024073, ...,  -3.0575374 ,
       -24.0171778 , -18.78834653])
In [24]:
# Looking for instances in the merged_df where 'we1_step66_mean' is less than -1000 and WE4 is less than -3000

merged_df[(merged_df['we1_step66_tailmean'] < -1000) & (merged_df['WE4'] < -3000)]
Out[24]:
Study Label Pellet Variable A Variable B Variable C Variable D Procedure Test Sample RP Type RP Cell Count ... we5_step93_max we6_step93_mean we6_step93_min we6_step93_max we1_step66_tailmean we2_step66_tailmean we3_step66_tailmean we4_step66_tailmean we5_step66_tailmean we6_step66_tailmean
6591 FABC-0324 FABC-Tetra-20250917-MT-HF-T6K-A LoB day 2, 58037H VVP-1128_NTC_SM_2 NaN NaN FABC-1.0.0 NTC SCM - ... 0.0 0.000000 0.0000 0.0 -15661.002850 0.0 0.0 0.0 0.0 0.0
10741 FABC-0314 FABC-Tetra-20250908-MT-HF-T2K-A Tecan 160.02, build 2 t=2wk, 30C NaN NaN FABC-1.0.0 NTC SCM NaN ... 0.0 -566.723064 -2623.4622 0.0 -10620.319193 0.0 0.0 0.0 0.0 0.0
15443 FABC-0296 FABC-Tetra-20250828-MT-HF-T6K-A Tecan 155 (T142) 40C 1 week NaN NaN FABC-005-13Aug25 CoV SCM - ... 0.0 -5802.806436 -8847.3590 0.0 -1223.299247 0.0 0.0 0.0 0.0 0.0
17220 FABC-0267 Wax82-FABC-Tetra-20250730-MG-HF-T6K-A 1X PDDNM + Virus (PTC, 200 cp/wand) NaN NaN NaN 745 MUX (666 Flu+CoV PID, modified RV flow check) FluB PDDNM NaN ... NaN NaN NaN NaN -14786.362714 0.0 0.0 0.0 0.0 0.0
27166 FABC-0314 FABC-Tetra-20250908-MT-HF-T2K-A Tecan 160.02, build 2 t=2wk, 30C NaN NaN FABC-1.0.0 NTC SCM NaN ... 0.0 -566.723064 -2623.4622 0.0 -10620.319193 0.0 0.0 0.0 0.0 0.0
28689 FABC-0324 FABC-Tetra-20250917-MT-HF-T6K-A LoB day 2, 58037H VVP-1128_NTC_SM_2 NaN NaN FABC-1.0.0 NTC SCM - ... 0.0 0.000000 0.0000 0.0 -15661.002850 0.0 0.0 0.0 0.0 0.0
31830 FABC-0296 FABC-Tetra-20250828-MT-HF-T6K-A Tecan 155 (T142) 40C 1 week NaN NaN FABC-005-13Aug25 CoV SCM - ... 0.0 -5802.806436 -8847.3590 0.0 -1223.299247 0.0 0.0 0.0 0.0 0.0
39152 FABC-0267 Wax82-FABC-Tetra-20250730-MG-HF-T6K-A 1X PDDNM + Virus (PTC, 200 cp/wand) NaN NaN NaN 745 MUX (666 Flu+CoV PID, modified RV flow check) FluB PDDNM NaN ... NaN NaN NaN NaN -14786.362714 0.0 0.0 0.0 0.0 0.0
50776 FABC-0232 Wax108-FABC-Tetra-20250609-MG-HF-T6K-A Wash Leak T=2; 40C NaN NaN 745 MUX (666 Flu+CoV PID, modified RV flow check) NTC A549 Cells - ... NaN NaN NaN NaN -3793.752833 0.0 0.0 0.0 0.0 0.0
50866 FABC-0232 Wax108-FABC-Tetra-20250609-MG-HF-T6K-A TMB Leak T=2; 40C NaN NaN 745 MUX (666 Flu+CoV PID, modified RV flow check) NTC A549 Cells - ... NaN NaN NaN NaN -2676.759767 0.0 0.0 0.0 0.0 0.0

10 rows × 173 columns

In [25]:
# Checking 'Procedure Name' values within the dataset

merged_df['Procedure'].value_counts()
Out[25]:
count
Procedure
745 MUX (666 Flu+CoV PID, modified RV flow check) 34148
FABC-1.0.0 28862
FABC-005-13Aug25 1630
Custom Needed 240
FABC-003-R&D-16Apr25-PID-15MinMT-FeEDTA-Concurrent 8

In [26]:
# Keeping only 745 MUX (666 Flu+CoV PID, modified RV flow check), FABC-1.0.0, and FABC-005-13Aug25

merged_df = merged_df[merged_df['Procedure'].isin(['745 MUX (666 Flu+CoV PID, modified RV flow check)', 'FABC-1.0.0', 'FABC-005-13Aug25'])]
merged_df.shape
Out[26]:
(64640, 173)

WE1 PreCheck Binning Analysis¶

In [27]:
# Visualizing 'we1_step66_tailmean values as a function of WE4 values

plt.figure(figsize=(10, 6))
plt.scatter(merged_df['WE4'], merged_df['we1_step66_tailmean'], alpha=0.5)
plt.xlabel('WE4')
plt.ylabel('we1_step66_tailmean')
plt.title('Scatter Plot of we1_step66_tailmean vs. WE4')
# horizontal line at -1000
plt.axhline(y=-1000, color='r', linestyle='--')
# vertical line at -3000
plt.axvline(x=-3000, color='g', linestyle='--')
plt.grid(True)
plt.show()
No description has been provided for this image
In [28]:
# Getting a count of values in each bin in the histplot

merged_df['we1_step66_tailmean'].value_counts()
Out[28]:
count
we1_step66_tailmean
0.000000 189
-4.652774 52
-4.652774 49
-7.311503 47
-6.646821 45
... ...
-11.299594 1
-11.919965 1
-33.376532 1
-31.284368 1
-11.166658 1

26763 rows × 1 columns


In [29]:
# Standardizing 'Date Tested' format

merged_df['Date Tested'] = pd.to_datetime(merged_df['Date Tested'], format='mixed')

# Standardizing "Reader SN" strings

merged_df['Reader SN'] = merged_df['Reader SN'].astype(str)
merged_df['Reader SN'] = merged_df['Reader SN'].str.strip()
merged_df['Reader SN'] = merged_df['Reader SN'].str.replace(r'\.0$', '', regex=True)
In [30]:
# drop duplicated

merged_df = merged_df.drop_duplicates()
In [31]:
# Binning the WE1 precheck data

col = "we1_step66_tailmean"
data = merged_df[col].dropna()

# Define your bins and labels
bins = [float("-inf"), -1000, -500, -300, -150, -70, 0, float("inf")]
labels = [
    "< -1000",
    "-1000 to -501",
    "-500 to -301",
    "-300 to -151",
    "-150 to -71",
    "-70 to 0",
    "> 0"
]

# Apply binning
binned = pd.cut(data, bins=bins, labels=labels, include_lowest=True)
summary = binned.value_counts().sort_index().reset_index()
summary.columns = ["bin_range", "count"]

# Add percentages
summary["percent"] = (summary["count"] / summary["count"].sum() * 100).round(2)

print(summary)
       bin_range  count  percent
0        < -1000    249     0.48
1  -1000 to -501    138     0.26
2   -500 to -301    109     0.21
3   -300 to -151    678     1.30
4    -150 to -71   6774    12.94
5       -70 to 0  44386    84.81
6            > 0      0     0.00
In [32]:
# Visualizing the bins

plt.figure(figsize=(8,4))
plt.bar(summary["bin_range"], summary["percent"])
plt.title(f"Distribution of {col}")
plt.xlabel("Value Range")
plt.ylabel("Percent of total")
plt.xticks(rotation=30)
# Annotating percentage above each bar
for i, v in enumerate(summary["percent"]):
    plt.text(i, v, f"{v}%", ha="center", va="bottom")
plt.show()
No description has been provided for this image
In [33]:
# Looking at WE1, WE2, WE3, WE4, WE5, and WE6 values of my individual bins

# Add the binned data as a new column to merged_df
merged_df['we1_bins'] = binned

# Group by the created bins and describe the WE columns
we_analysis = merged_df.groupby('we1_bins')[['WE1', 'WE2', 'WE3', 'WE4', 'WE5', 'WE6']].describe()
display(we_analysis)
WE1 WE2 ... WE5 WE6
count mean std min 25% 50% 75% max count mean ... 75% max count mean std min 25% 50% 75% max
we1_bins
< -1000 249.0 -15978.345382 10692.977494 -43561.0 -23145.00 -15623.0 -8034.00 0.0 249.0 -5670.518072 ... -66.00 0.0 249.0 -15881.923695 5484.392109 -33526.0 -19200.00 -15273.0 -13232.00 -120.0
-1000 to -501 138.0 -13230.224638 9847.550151 -31245.0 -20889.00 -14310.0 -973.00 0.0 138.0 -2700.333333 ... -74.25 0.0 138.0 -13380.681159 6045.181911 -23341.0 -17537.50 -14309.0 -11273.00 -91.0
-500 to -301 109.0 -10666.247706 9395.899591 -30884.0 -18216.00 -11188.0 -422.00 0.0 109.0 -1949.018349 ... -69.00 0.0 109.0 -12866.000000 5921.610329 -27830.0 -15895.00 -13696.0 -9591.00 -166.0
-300 to -151 678.0 -12499.346608 7285.822723 -31324.0 -16851.00 -13814.0 -8557.75 0.0 678.0 -1920.556047 ... -100.00 0.0 678.0 -15342.626844 4768.705834 -30446.0 -18414.00 -15799.5 -12659.25 0.0
-150 to -71 6774.0 -12888.855920 7711.826035 -33257.0 -18173.75 -14050.5 -8147.00 0.0 6774.0 -2447.797756 ... -92.00 0.0 6774.0 -16157.456156 3610.766955 -30605.0 -18418.50 -16219.5 -14153.25 0.0
-70 to 0 44386.0 -14523.415762 8569.709100 -36065.0 -21003.00 -15958.5 -8722.25 0.0 44386.0 -2773.260848 ... -78.00 0.0 44386.0 -15508.747601 3831.769741 -33932.0 -17953.75 -15432.0 -13182.00 0.0
> 0 0.0 NaN NaN NaN NaN NaN NaN NaN 0.0 NaN ... NaN NaN 0.0 NaN NaN NaN NaN NaN NaN NaN

7 rows × 48 columns

In [34]:
import matplotlib.pyplot as plt

# Extract just the mean rows for each WE column per bin
we_means = merged_df.groupby('we1_bins')[['WE1', 'WE2', 'WE3', 'WE4', 'WE5', 'WE6']].mean()

# Optional: ensure bins appear in logical order (based on your label order)
we_means = we_means.reindex(labels)

# Display table of mean values
display(we_means)

# --- Plot setup ---
plt.figure(figsize=(8,5))
for col in we_means.columns:
    plt.plot(we_means.index, we_means[col], marker='o', label=col)

plt.title("Mean WE# Values by we1_bin")
plt.xlabel("WE1 Bins")
plt.ylabel("Mean Value")
plt.legend(title="WE#")
plt.legend(bbox_to_anchor=(1.2, 1), loc='upper right'),
plt.xticks(rotation=30)
plt.grid(True)
plt.tight_layout()
plt.show()
WE1 WE2 WE3 WE4 WE5 WE6
we1_bins
< -1000 -15978.345382 -5670.518072 -3971.329317 -629.281124 -2961.257028 -15881.923695
-1000 to -501 -13230.224638 -2700.333333 -1571.891304 -303.630435 -1974.449275 -13380.681159
-500 to -301 -10666.247706 -1949.018349 -3278.009174 -674.651376 -2386.935780 -12866.000000
-300 to -151 -12499.346608 -1920.556047 -4687.379056 -368.998525 -3671.064897 -15342.626844
-150 to -71 -12888.855920 -2447.797756 -4718.284027 -199.943313 -3228.346619 -16157.456156
-70 to 0 -14523.415762 -2773.260848 -4218.323007 -183.461204 -2742.110936 -15508.747601
> 0 NaN NaN NaN NaN NaN NaN
No description has been provided for this image
In [35]:
import matplotlib.pyplot as plt
import numpy as np

we_means = merged_df.groupby('we1_bins')[['WE1', 'WE2', 'WE3', 'WE4', 'WE5', 'WE6']].mean()
we_means = we_means.reindex(labels)  # ensure correct bin order

bins = we_means.index
we_cols = we_means.columns
x = np.arange(len(bins))  # the label locations
width = 0.13  # width of each bar

plt.figure(figsize=(10,5))
for i, col in enumerate(we_cols):
    plt.bar(x + i*width, we_means[col], width=width, label=col)

plt.xticks(x + width*2.5, bins, rotation=30)
plt.ylabel("Mean WE# Value")
plt.xlabel("WE1 Bins")
plt.title("Mean WE# Values per Bin")
plt.legend(title="WE#")
plt.legend(bbox_to_anchor=(1.11, 1), loc='upper right')
plt.hlines(y=-3000, xmin=x.min(), xmax=x.max(), linestyles='dashed')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [36]:
import seaborn as sns

plt.figure(figsize=(8,6))
sns.heatmap(we_means, annot=True, fmt=".1f", cmap="coolwarm", cbar_kws={'label': 'Mean Value'})
plt.title("Mean WE# Values by we1_bin")
plt.ylabel("WE1 Bins")
plt.xlabel("WE#")
plt.xticks(rotation=0)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [37]:
import pandas as pd

# 1️⃣ Compute mean WE values per bin
we_means = merged_df.groupby('we1_bins')[['WE1', 'WE2', 'WE3', 'WE4', 'WE5', 'WE6']].mean()

# Ensure bins are in the correct order
we_means = we_means.reindex(labels)

# 2️⃣ Count samples per bin
bin_counts = merged_df['we1_bins'].value_counts().reindex(labels)
bin_counts.name = "count"

# 3️⃣ Identify the control bin
control_bin = "-70 to 0"
control_values = we_means.loc[control_bin]

# 4️⃣ Compute Delta_WE# relative to control
delta_cols = {col: f"Delta_{col}" for col in we_means.columns}
delta_values = we_means - control_values
delta_values.rename(columns=delta_cols, inplace=True)

# 5️⃣ Combine counts, means, and deltas into one summary DataFrame
summary_df = pd.concat([bin_counts, we_means, delta_values], axis=1).reset_index()
summary_df.rename(columns={"we1_bins": "bin_range"}, inplace=True)

# 6️⃣ Format numeric columns to 1 decimal place
we_cols = ['WE1','WE2','WE3','WE4','WE5','WE6']
delta_cols = [f"Delta_{col}" for col in we_cols]
summary_df[we_cols + delta_cols] = summary_df[we_cols + delta_cols].round(1)

# 7️⃣ Display the table
display(summary_df)
bin_range count WE1 WE2 WE3 WE4 WE5 WE6 Delta_WE1 Delta_WE2 Delta_WE3 Delta_WE4 Delta_WE5 Delta_WE6
0 < -1000 249 -15978.3 -5670.5 -3971.3 -629.3 -2961.3 -15881.9 -1454.9 -2897.3 247.0 -445.8 -219.1 -373.2
1 -1000 to -501 138 -13230.2 -2700.3 -1571.9 -303.6 -1974.4 -13380.7 1293.2 72.9 2646.4 -120.2 767.7 2128.1
2 -500 to -301 109 -10666.2 -1949.0 -3278.0 -674.7 -2386.9 -12866.0 3857.2 824.2 940.3 -491.2 355.2 2642.7
3 -300 to -151 678 -12499.3 -1920.6 -4687.4 -369.0 -3671.1 -15342.6 2024.1 852.7 -469.1 -185.5 -929.0 166.1
4 -150 to -71 6774 -12888.9 -2447.8 -4718.3 -199.9 -3228.3 -16157.5 1634.6 325.5 -500.0 -16.5 -486.2 -648.7
5 -70 to 0 44386 -14523.4 -2773.3 -4218.3 -183.5 -2742.1 -15508.7 0.0 0.0 0.0 0.0 0.0 0.0
6 > 0 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
In [38]:
# Plot the Delta WE's

delta_only = summary_df.set_index('bin_range')[delta_cols]

plt.figure(figsize=(8,6))
sns.heatmap(delta_only, annot=True, fmt=".1f", cmap="coolwarm", cbar_kws={'label': 'Delta Value'})
plt.title("Delta WE# Values Relative to Control Bin (-70 to 0)")
plt.ylabel("WE1 Bins")
plt.xlabel("WE#")
plt.xticks(rotation=0)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [39]:
# Extracting cartridge information for cartridges where we1_step66_tailmean is between -500 and -301

cartridge_serial_numbers = merged_df[(merged_df['we1_step66_tailmean'] > -500) & (merged_df['we1_step66_tailmean'] < -301)]
print(f'Unique cartridges are:', cartridge_serial_numbers['cartridgeSerialNumber'].head(15))
Unique cartridges are: 4271     137212809
12290    137119509
16741    137168719
16765    137166595
16986    137167116
17126    137162146
17648    137165768
19512    137159790
19723    137151850
20809    137160522
21668    137155036
21819    137155417
21869    137155281
22254    137142537
22323    137142464
Name: cartridgeSerialNumber, dtype: object
In [40]:
# Looking at the runtime details of the cartridge_serial_numbers (-301 to -501 WE1 precheck bin)

cols_to_keep=[
    'cartridgeSerialNumber',
    'Pellet',
    'Procedure',
    'we1_step66_tailmean',
    'Lot Number',
 'Reader SN',
 'Mfg Date',
 'Date Tested',
    'Warnings',
 'Errors',
    'WE1',
    'WE2',
    'WE3',
    'WE4',
    'WE5',
    'WE6'
]

cartridge_serial_numbers_filtered = cartridge_serial_numbers[cols_to_keep]
cartridge_serial_numbers_filtered.head(18)
Out[40]:
cartridgeSerialNumber Pellet Procedure we1_step66_tailmean Lot Number Reader SN Mfg Date Date Tested Warnings Errors WE1 WE2 WE3 WE4 WE5 WE6
4271 137212809 FABC-Tetra-20251001-MT-HF-HD-G FABC-1.0.0 -356.940495 R58052H 12202010090368 03-Oct-2025 2025-10-06 NaN NaN -20173 -3273 -566 -283 -6967 -16275
12290 137119509 NaN FABC-1.0.0 -328.445716 R57584I 12102010131018 06-Jun-2025 2025-09-16 NaN NaN -13828 -139 -246 -127 -242 -15156
16741 137168719 Wax82-FABC-Tetra-20250806-MT-HF-T6K-A 745 MUX (666 Flu+CoV PID, modified RV flow check) -408.975601 R57910H 12202010082982 08-Aug-2025 2025-08-08 NaN NaN -15684 -689 -234 -353 -54 -13312
16765 137166595 Wax82-FABC-Tetra-20250805-MG-HF-T6K-A 745 MUX (666 Flu+CoV PID, modified RV flow check) -424.605121 R57906H 12002010004205 07-Aug-2025 2025-08-07 NaN NaN -14405 -71 -124 -62 -62 -12660
16986 137167116 Wax82-FABC-Tetra-20250805-MT-HF-T6K-B 745 MUX (666 Flu+CoV PID, modified RV flow check) -311.407692 R57908H 12202010013132 07-Aug-2025 2025-08-08 NaN NaN -21679 -65 -209 -102 -56 -15552
17126 137162146 Wax82-FABC-Tetra-20250730-MG-HF-T6K-A 745 MUX (666 Flu+CoV PID, modified RV flow check) -473.070538 R57902H 12102010081782 04-Aug-2025 2025-08-08 NaN NaN -1124 -219 -14371 -58 -59 -13696
17648 137165768 Wax82-FABC-Tetra-20250804-MG-HF-T6K-A 745 MUX (666 Flu+CoV PID, modified RV flow check) -375.108464 R57904H 12102010161207 06-Aug-2025 2025-08-06 NaN NaN -20720 -128 -189 -116 -58 -12649
19512 137159790 FABC-Tetra-20250711-MT-HF-HD-B 745 MUX (666 Flu+CoV PID, modified RV flow check) -447.292945 R57839H 12102010138374 31-Jul-2025 2025-08-05 NaN NaN -9711 -144 -290 -251 -114 -16697
19723 137151850 NaN 745 MUX (666 Flu+CoV PID, modified RV flow check) -387.990639 R57853H 12102010133232 22-Jul-2025 2025-08-05 NaN NaN -15273 -4 -4 -55 -24165 -13178
20809 137160522 Wax82-FABC-Tetra-20250721-MG-HF-T6K-A 745 MUX (666 Flu+CoV PID, modified RV flow check) -451.458281 R57901H 12102010089679 01-Aug-2025 2025-08-11 [80] NaN -185 -241 -182 -251 -177 -9169
21668 137155036 Wax82-FABC-Tetra-20250718-MG-HF-T6K-A 745 MUX (666 Flu+CoV PID, modified RV flow check) -477.391277 R57872H 12202010047848 25-Jul-2025 2025-08-13 NaN NaN -5505 -343 -19896 -66 -56 -9538
21819 137155417 Wax82-FABC-Tetra-20250718-MG-HF-T6K-A 745 MUX (666 Flu+CoV PID, modified RV flow check) -464.707606 R57873H 12102010129228 25-Jul-2025 2025-08-11 NaN NaN -26580 -266 -501 -262 -5685 -13759
21869 137155281 Wax82-FABC-Tetra-20250718-MG-HF-T6K-A 745 MUX (666 Flu+CoV PID, modified RV flow check) -334.244725 R57873H 12202010047850 25-Jul-2025 2025-08-13 NaN NaN -10321 -156 -24420 -41 -69 -11781
22254 137142537 Wax108-FABC-Tetra-20250630-MG-HF-T6K-A 745 MUX (666 Flu+CoV PID, modified RV flow check) -393.308098 R57776H 12102010123927 08-Jul-2025 2025-08-11 [80] NaN -248 -245 -226 -164 -217 -11114
22323 137142464 Wax108-FABC-Tetra-20250630-MG-HF-T6K-A FABC-1.0.0 -334.295049 R57776H 12202010090951 08-Jul-2025 2025-10-02 NaN NaN -1493 -227 -143 -90 -188 -18742
22361 137142977 Wax108-FABC-Tetra-20250630-MG-HF-T6K-A FABC-1.0.0 -331.902057 R57776H 12202010090073 08-Jul-2025 2025-10-02 NaN NaN -14987 -132 -2587 -85 -130 -15895
27350 137142464 Wax108-FABC-Tetra-20250630-MG-HF-T6K-A FABC-1.0.0 -334.295049 R57776H 12202010090951 08-Jul-2025 2025-10-02 NaN NaN -1493 -227 -143 -90 -188 -18742
27384 137142977 Wax108-FABC-Tetra-20250630-MG-HF-T6K-A FABC-1.0.0 -331.902057 R57776H 12202010090073 08-Jul-2025 2025-10-02 NaN NaN -14987 -132 -2587 -85 -130 -15895
In [41]:
# Extracting cartridge information for cartridges where we1_step66_tailmean is between -1000 and -501

cartridge_serial_numbers_1000 = merged_df[(merged_df['we1_step66_tailmean'] > -1000) & (merged_df['we1_step66_tailmean'] < -501)]
print(f'Unique cartridges are:', cartridge_serial_numbers_1000['cartridgeSerialNumber'].head(15))
Unique cartridges are: 1619     137222417
9726     137199665
14659    153480814
15269    153480497
16662    137167715
16911    137166548
17231    137163128
17322    137168912
17663    137165913
17685    137165820
17850    137165882
17866    137165016
17868    137165642
19597    137159320
19994    137163278
Name: cartridgeSerialNumber, dtype: object
In [42]:
# Looking at the runtime details of the cartridge_serial_numbers (-501 to -1000 WE1 precheck bin)

cols_to_keep=[
    'cartridgeSerialNumber',
    'Pellet',
    'Procedure',
    'we1_step66_tailmean',
    'Lot Number',
 'Reader SN',
 'Mfg Date',
 'Date Tested',
    'Warnings',
 'Errors',
    'WE1',
    'WE2',
    'WE3',
    'WE4',
    'WE5',
    'WE6'
]

cartridge_serial_numbers_1000_filtered = cartridge_serial_numbers_1000[cols_to_keep]
cartridge_serial_numbers_1000_filtered.head(18)
Out[42]:
cartridgeSerialNumber Pellet Procedure we1_step66_tailmean Lot Number Reader SN Mfg Date Date Tested Warnings Errors WE1 WE2 WE3 WE4 WE5 WE6
1619 137222417 FABC-Tetra-20251007-KS-HF-T2K-A FABC-1.0.0 -552.093228 R58113H 12202010089460 10-Oct-2025 2025-10-14 NaN NaN -11253 -142 -166 -74 -50 -7174
9726 137199665 FABC-Tetra-20250916-MT-HF-T6K-A FABC-1.0.0 -660.526828 R58036H 12102010082289 18-Sep-2025 2025-09-19 [95, 96, 80] NaN -507 -570 -418 -640 0 -16702
14659 153480814 NaN FABC-005-13Aug25 -677.587269 R58022I 12102010011645 10-Sep-2025 2025-09-18 [95, 96, 6, 20, 93, 80] 20.0 -555 0 0 0 0 -146
15269 153480497 FABC-Tetra-20250827-MT-HF-T6K-A FABC-005-13Aug25 -984.940964 R58016I 12102010082269 08-Sep-2025 2025-09-16 [95, 96, 20, 93] NaN -22646 -2699 -712 -172 -264 -400
16662 137167715 Wax82-FABC-Tetra-20250806-MG-HF-T6K-A 745 MUX (666 Flu+CoV PID, modified RV flow check) -814.336594 R57909H 12202010164283 08-Aug-2025 2025-08-08 NaN NaN -31245 -199 -2997 -129 -100 -13177
16911 137166548 Wax82-FABC-Tetra-20250805-MT-HF-T6K-A 745 MUX (666 Flu+CoV PID, modified RV flow check) -679.404063 R57907H 12102010080974 07-Aug-2025 2025-08-11 NaN NaN -29225 -23584 -418 -173 -85 -17279
17231 137163128 Wax82-FABC-Tetra-20250730-MG-HF-T6K-A 745 MUX (666 Flu+CoV PID, modified RV flow check) -530.424595 R57902H 12102010080870 04-Aug-2025 2025-08-11 NaN NaN -29868 -20256 -319 -1003 -179 -12167
17322 137168912 FABC-Tetra-20250723-MG-HF-T2K-C 745 MUX (666 Flu+CoV PID, modified RV flow check) -806.449043 R57911H 12102010081373 08-Aug-2025 2025-08-11 [80] NaN -991 -1333 -1246 -1462 -1221 -9549
17663 137165913 Wax82-FABC-Tetra-20250804-MG-HF-T6K-A 745 MUX (666 Flu+CoV PID, modified RV flow check) -629.863083 R57904H 12102010039201 06-Aug-2025 2025-08-06 [6] NaN -22956 -101 -364 -103 -113 -14309
17685 137165820 Wax82-FABC-Tetra-20250804-MG-HF-T6K-A 745 MUX (666 Flu+CoV PID, modified RV flow check) -517.700674 R57904H 12102010058164 06-Aug-2025 2025-08-07 [6, 80] NaN -86 -246 -88 -465 -79 -12214
17850 137165882 Wax82-FABC-Tetra-20250804-MG-HF-T6K-A 745 MUX (666 Flu+CoV PID, modified RV flow check) -869.993207 R57904H 12002010004072 06-Aug-2025 2025-08-07 NaN NaN -22026 -122 -259 -125 -63 -19725
17866 137165016 Wax82-FABC-Tetra-20250804-MG-HF-T6K-A 745 MUX (666 Flu+CoV PID, modified RV flow check) -678.119021 R57904H 12102010086647 06-Aug-2025 2025-08-07 NaN NaN -13692 -143 -2534 -59 -72 -15693
17868 137165642 Wax82-FABC-Tetra-20250804-MG-HF-T6K-A 745 MUX (666 Flu+CoV PID, modified RV flow check) -700.015542 R57904H 12102010044975 06-Aug-2025 2025-08-07 NaN NaN -15455 -285 -8311 -162 -190 -13789
19597 137159320 FABC-Tetra-20250711-MT-HF-HD-D 745 MUX (666 Flu+CoV PID, modified RV flow check) -999.076676 R57841H 12102010131220 31-Jul-2025 2025-08-05 [80] NaN -184 -224 -107 -235 -87 -23341
19994 137163278 Wax82-FABC-Tetra-20250730-MG-HF-T6K-A 745 MUX (666 Flu+CoV PID, modified RV flow check) -555.726811 R57902H 12102010074667 04-Aug-2025 2025-08-05 NaN NaN -11349 -298 -201 -143 -80 -19290
20315 137163240 Wax82-FABC-Tetra-20250730-MG-HF-T6K-A 745 MUX (666 Flu+CoV PID, modified RV flow check) -795.990810 R57902H 12102010129816 04-Aug-2025 2025-08-06 NaN NaN -29041 -164 -303 -130 -102 -14554
20615 137160781 Wax82-FABC-Tetra-20250721-MG-HF-T6K-A 745 MUX (666 Flu+CoV PID, modified RV flow check) -593.901726 R57901H 12102010125231 01-Aug-2025 2025-08-01 NaN NaN -21507 -181 -381 -214 -88 -19497
20627 137160526 Wax82-FABC-Tetra-20250721-MG-HF-T6K-A 745 MUX (666 Flu+CoV PID, modified RV flow check) -928.309345 R57901H 12202010055315 01-Aug-2025 2025-08-01 [80] NaN -122 -165 -77 -715 -153 -14454

FluA WE1 Precheck Binning Analysis¶

In [43]:
merged_df.shape
Out[43]:
(52334, 174)
In [44]:
df_fluA.shape
Out[44]:
(11935, 170)
In [45]:
# Standardizing 'Date Tested' format

df_fluA['Date Tested'] = pd.to_datetime(df_fluA['Date Tested'], format='mixed')

# Standardizing "Reader SN" strings

df_fluA['Reader SN'] = df_fluA['Reader SN'].astype(str)
df_fluA['Reader SN'] = df_fluA['Reader SN'].str.strip()
df_fluA['Reader SN'] = df_fluA['Reader SN'].str.replace(r'\.0$', '', regex=True)

# Keeping only 745 MUX (666 Flu+CoV PID, modified RV flow check), FABC-1.0.0, and FABC-005-13Aug25

df_fluA = df_fluA[df_fluA['Procedure'].isin(['745 MUX', 'FABC-1.0.0', 'FABC-005-13Aug25'])]
In [46]:
# Visualizing 'we1_step66_tailmean values as a function of WE4 values

plt.figure(figsize=(10, 6))
plt.scatter(df_fluA['WE3'], df_fluA['we1_step66_tailmean'], alpha=0.5)
plt.xlabel('WE3')
plt.ylabel('we1_step66_tailmean')
plt.title('Scatter Plot of we1_step66_tailmean vs. WE3')
# horizontal line at -1000
plt.axhline(y=-1000, color='r', linestyle='--')
# vertical line at -3000
plt.axvline(x=-3000, color='g', linestyle='--')
plt.grid(True)
plt.show()
No description has been provided for this image
In [47]:
# drop duplicated

df_fluA_filtered = df_fluA.drop_duplicates()

df_fluA_filtered.shape
Out[47]:
(4981, 170)
In [48]:
# Building df_analysis forlater analysis work

# Use a clean copy of the data
df_analysis = df_fluA.copy()  # or df_cov_filtered, depending on the dataset

# Ensure the column exists and drop NA only for the analysis
df_analysis = df_analysis[df_analysis['we1_step66_tailmean'].notna()]
In [49]:
# Binning the WE1 precheck data

col = "we1_step66_tailmean"
data = df_fluA_filtered[col].dropna()

# Define your bins and labels
bins = [float("-inf"), -1000, -500, -300, -150, -70, 0, float("inf")]
labels = [
    "< -1000",
    "-1000 to -501",
    "-500 to -301",
    "-300 to -151",
    "-150 to -71",
    "-70 to 0",
    "> 0"
]

# Apply binning
binned = pd.cut(data, bins=bins, labels=labels, include_lowest=True)
summary = binned.value_counts().sort_index().reset_index()
summary.columns = ["bin_range", "count"]

# Add percentages
summary["percent"] = (summary["count"] / summary["count"].sum() * 100).round(2)

print(summary)
       bin_range  count  percent
0        < -1000      0     0.00
1  -1000 to -501      0     0.00
2   -500 to -301      4     0.08
3   -300 to -151     68     1.37
4    -150 to -71    943    18.93
5       -70 to 0   3966    79.62
6            > 0      0     0.00
In [50]:
# Visualizing the bins

plt.figure(figsize=(8,4))
plt.bar(summary["bin_range"], summary["percent"])
plt.title(f"Distribution of {col}")
plt.xlabel("Value Range")
plt.ylabel("Percent of total")
plt.xticks(rotation=30)
# Annotating percentage above each bar
for i, v in enumerate(summary["percent"]):
    plt.text(i, v, f"{v}%", ha="center", va="bottom")
plt.show()
No description has been provided for this image
In [51]:
# Looking at WE1, WE2, WE3, WE4, WE5, and WE6 values of my individual bins

# Add the binned data as a new column to df_fluA_filtered
df_fluA_filtered['we1_bins'] = binned

# Group by the created bins and describe the WE columns
we_analysis = df_fluA_filtered.groupby('we1_bins')[['WE1', 'WE2', 'WE3', 'WE4', 'WE5', 'WE6']].describe()
display(we_analysis)
WE1 WE2 ... WE5 WE6
count mean std min 25% 50% 75% max count mean ... 75% max count mean std min 25% 50% 75% max
we1_bins
< -1000 0.0 NaN NaN NaN NaN NaN NaN NaN 0.0 NaN ... NaN NaN 0.0 NaN NaN NaN NaN NaN NaN NaN
-1000 to -501 0.0 NaN NaN NaN NaN NaN NaN NaN 0.0 NaN ... NaN NaN 0.0 NaN NaN NaN NaN NaN NaN NaN
-500 to -301 4.0 -8240.000000 7790.764532 -14987.0 -14987.0 -8240.0 -1493.0 -1493.0 4.0 -179.500000 ... -130.0 -130.0 4.0 -17318.500000 1643.716216 -18742.0 -18742.0 -17318.5 -15895.0 -15895.0
-300 to -151 68.0 -13055.264706 5814.235625 -30589.0 -16620.0 -13099.5 -9780.0 -375.0 68.0 -145.705882 ... -109.0 -54.0 68.0 -15078.676471 3961.713362 -21566.0 -18226.0 -14991.5 -11788.0 -5682.0
-150 to -71 943.0 -14960.757158 5202.243776 -33081.0 -17976.0 -14981.0 -12004.0 0.0 943.0 -199.527041 ... -86.0 0.0 943.0 -16236.235419 3379.337008 -26357.0 -18452.0 -16537.0 -14566.5 0.0
-70 to 0 3966.0 -17345.751891 6159.848349 -33082.0 -21557.5 -17633.0 -13711.0 0.0 3966.0 -225.593797 ... -70.0 0.0 3966.0 -15290.284670 3517.765340 -24972.0 -17573.5 -15401.5 -13218.0 0.0
> 0 0.0 NaN NaN NaN NaN NaN NaN NaN 0.0 NaN ... NaN NaN 0.0 NaN NaN NaN NaN NaN NaN NaN

7 rows × 48 columns

In [52]:
import matplotlib.pyplot as plt
import numpy as np

we_means = df_fluA_filtered.groupby('we1_bins')[['WE1', 'WE2', 'WE3', 'WE4', 'WE5', 'WE6']].mean()
we_means = we_means.reindex(labels)  # ensure correct bin order

bins = we_means.index
we_cols = we_means.columns
x = np.arange(len(bins))  # the label locations
width = 0.13  # width of each bar

plt.figure(figsize=(10,5))
for i, col in enumerate(we_cols):
    plt.bar(x + i*width, we_means[col], width=width, label=col)

plt.xticks(x + width*2.5, bins, rotation=30)
plt.ylabel("Mean WE# Value")
plt.xlabel("WE1 Bins")
plt.title("Mean WE# Values per Bin")
plt.legend(title="WE#")
plt.legend(bbox_to_anchor=(1.11, 1), loc='upper right')
plt.hlines(y=-3000, xmin=x.min(), xmax=x.max(), linestyles='dashed')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [53]:
import seaborn as sns

plt.figure(figsize=(8,6))
sns.heatmap(we_means, annot=True, fmt=".1f", cmap="coolwarm", cbar_kws={'label': 'Mean Value'})
plt.title("Mean WE# Values by we1_bin")
plt.ylabel("WE1 Bins")
plt.xlabel("WE#")
plt.xticks(rotation=0)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [54]:
import pandas as pd

# 1️⃣ Compute mean WE values per bin
we_means = df_fluA_filtered.groupby('we1_bins')[['WE1', 'WE2', 'WE3', 'WE4', 'WE5', 'WE6']].mean()

# Ensure bins are in the correct order
we_means = we_means.reindex(labels)

# 2️⃣ Count samples per bin
bin_counts = df_fluA_filtered['we1_bins'].value_counts().reindex(labels)
bin_counts.name = "count"

# 3️⃣ Identify the control bin
control_bin = "-70 to 0"
control_values = we_means.loc[control_bin]

# 4️⃣ Compute Delta_WE# relative to control
delta_cols = {col: f"Delta_{col}" for col in we_means.columns}
delta_values = we_means - control_values
delta_values.rename(columns=delta_cols, inplace=True)

# 5️⃣ Combine counts, means, and deltas into one summary DataFrame
summary_df = pd.concat([bin_counts, we_means, delta_values], axis=1).reset_index()
summary_df.rename(columns={"we1_bins": "bin_range"}, inplace=True)

# 6️⃣ Format numeric columns to 1 decimal place
we_cols = ['WE1','WE2','WE3','WE4','WE5','WE6']
delta_cols = [f"Delta_{col}" for col in we_cols]
summary_df[we_cols + delta_cols] = summary_df[we_cols + delta_cols].round(1)

# 7️⃣ Display the table
display(summary_df)
bin_range count WE1 WE2 WE3 WE4 WE5 WE6 Delta_WE1 Delta_WE2 Delta_WE3 Delta_WE4 Delta_WE5 Delta_WE6
0 < -1000 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 -1000 to -501 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 -500 to -301 4 -8240.0 -179.5 -1365.0 -87.5 -159.0 -17318.5 9105.8 46.1 19529.7 46.1 -13.7 -2028.2
3 -300 to -151 68 -13055.3 -145.7 -20466.4 -160.1 -133.6 -15078.7 4290.5 79.9 428.2 -26.5 11.7 211.6
4 -150 to -71 943 -14960.8 -199.5 -21239.1 -144.4 -188.7 -16236.2 2385.0 26.1 -344.4 -10.8 -43.4 -946.0
5 -70 to 0 3966 -17345.8 -225.6 -20894.7 -133.6 -145.3 -15290.3 0.0 0.0 0.0 0.0 0.0 0.0
6 > 0 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
In [55]:
# Plot the Delta WE's

delta_only = summary_df.set_index('bin_range')[delta_cols]

plt.figure(figsize=(8,6))
sns.heatmap(delta_only, annot=True, fmt=".1f", cmap="coolwarm", cbar_kws={'label': 'Delta Value'})
plt.title("Delta WE# Values Relative to Control Bin (-70 to 0)")
plt.ylabel("WE1 Bins")
plt.xlabel("WE#")
plt.xticks(rotation=0)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [56]:
import pandas as pd
import numpy as np

# --- 1️⃣ Compute mean WE values per bin ---
we_means = df_fluA_filtered.groupby('we1_bins')[['WE1', 'WE2', 'WE3', 'WE4', 'WE5', 'WE6']].mean()
we_means = we_means.reindex(labels)

# --- 2️⃣ Count samples per bin ---
bin_counts = df_fluA_filtered['we1_bins'].value_counts().reindex(labels)
bin_counts.name = "count"

# --- 3️⃣ Identify control bin ---
control_bin = "-70 to 0"
control_values = we_means.loc[control_bin]

# --- 4️⃣ Compute ΔWE relative to control ---
delta_cols = {col: f"Delta_{col}" for col in we_means.columns}
delta_values = we_means - control_values
delta_values.rename(columns=delta_cols, inplace=True)

# --- 5️⃣ Cartridge-level status determination ---
def classify_status(row):
    """Return Valid/Invalid based on warnings/errors presence."""
    has_issue = False
    for col in ['Warnings', 'Errors']:
        if col in row and pd.notna(row[col]) and str(row[col]).strip():
            has_issue = True
            break
    return "Invalid" if has_issue else "Valid"

df_fluA_filtered['status'] = df_fluA_filtered.apply(classify_status, axis=1)

# --- 6️⃣ Define helper flags (FN, WE2_FP, WE5_FP) ---
# Assuming these flags are boolean or identifiable by name in columns:
for flag in ['FN', 'WE2_FP', 'WE5_FP']:
    if flag not in df_fluA_filtered.columns:
        df_fluA_filtered[flag] = False  # ensure column exists

# --- 7️⃣ Compute percentages per bin ---
def compute_percentages(group):
    n = len(group)
    valid = (group['status'] == 'Valid').sum()
    invalid = (group['status'] == 'Invalid').sum()
    fn = group['FN'].sum()
    we2_fp = group['WE2_FP'].sum()
    we5_fp = group['WE5_FP'].sum()
    return pd.Series({
        '% Valid': 100 * valid / n if n else np.nan,
        '% Invalid': 100 * invalid / n if n else np.nan,
        '% FN': 100 * fn / n if n else np.nan,
        '% WE2 FP': 100 * we2_fp / n if n else np.nan,
        '% WE5 FP': 100 * we5_fp / n if n else np.nan
    })

percentages = df_fluA_filtered.groupby('we1_bins').apply(compute_percentages).reindex(labels)

# --- 8️⃣ Combine all results into one summary table ---
summary_df = pd.concat([bin_counts, we_means, delta_values, percentages], axis=1).reset_index()
summary_df.rename(columns={"we1_bins": "bin_range"}, inplace=True)

# --- 9️⃣ Format numeric columns to 1 decimal place ---
we_cols = ['WE1','WE2','WE3','WE4','WE5','WE6']
delta_cols = [f"Delta_{col}" for col in we_cols]
summary_df[we_cols + delta_cols] = summary_df[we_cols + delta_cols].round(1)
summary_df[['% Valid', '% Invalid', '% FN', '% WE2 FP', '% WE5 FP']] = \
    summary_df[['% Valid', '% Invalid', '% FN', '% WE2 FP', '% WE5 FP']].round(1)

# --- 🔟 Display the final summary ---
display(summary_df)
bin_range count WE1 WE2 WE3 WE4 WE5 WE6 Delta_WE1 Delta_WE2 Delta_WE3 Delta_WE4 Delta_WE5 Delta_WE6 % Valid % Invalid % FN % WE2 FP % WE5 FP
0 < -1000 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 -1000 to -501 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 -500 to -301 4 -8240.0 -179.5 -1365.0 -87.5 -159.0 -17318.5 9105.8 46.1 19529.7 46.1 -13.7 -2028.2 100.0 0.0 100.0 0.0 0.0
3 -300 to -151 68 -13055.3 -145.7 -20466.4 -160.1 -133.6 -15078.7 4290.5 79.9 428.2 -26.5 11.7 211.6 94.1 5.9 5.9 0.0 0.0
4 -150 to -71 943 -14960.8 -199.5 -21239.1 -144.4 -188.7 -16236.2 2385.0 26.1 -344.4 -10.8 -43.4 -946.0 92.2 7.8 7.7 0.2 0.6
5 -70 to 0 3966 -17345.8 -225.6 -20894.7 -133.6 -145.3 -15290.3 0.0 0.0 0.0 0.0 0.0 0.0 92.3 7.7 13.6 0.7 0.4
6 > 0 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
In [57]:
# --- 11️⃣ Deep dive: failure modes among Invalid samples ---

def compute_invalid_overlap(group):
    invalid_group = group[group['status'] == 'Invalid']
    n_invalid = len(invalid_group)
    if n_invalid == 0:
        return pd.Series({
            'Invalid_with_FN': 0,
            'Invalid_with_WE2_FP': 0,
            'Invalid_with_WE5_FP': 0,
            '% Invalid_with_FN': np.nan,
            '% Invalid_with_WE2_FP': np.nan,
            '% Invalid_with_WE5_FP': np.nan
        })
    return pd.Series({
        'Invalid_with_FN': invalid_group['FN'].sum(),
        'Invalid_with_WE2_FP': invalid_group['WE2_FP'].sum(),
        'Invalid_with_WE5_FP': invalid_group['WE5_FP'].sum(),
        '% Invalid_with_FN': 100 * invalid_group['FN'].sum() / n_invalid,
        '% Invalid_with_WE2_FP': 100 * invalid_group['WE2_FP'].sum() / n_invalid,
        '% Invalid_with_WE5_FP': 100 * invalid_group['WE5_FP'].sum() / n_invalid
    })

invalid_overlap = (
    df_fluA_filtered.groupby('we1_bins')
    .apply(compute_invalid_overlap)
    .reindex(labels)
)

# --- 12️⃣ Combine with main summary ---
summary_df_detailed = pd.concat([summary_df.set_index('bin_range'), invalid_overlap], axis=1).reset_index()

# --- 13️⃣ Round numeric values ---
cols_to_round = [
    '% Invalid_with_FN', '% Invalid_with_WE2_FP', '% Invalid_with_WE5_FP'
]
summary_df_detailed[cols_to_round] = summary_df_detailed[cols_to_round].round(1)

#--------Columns to keep-------------
summary_df_detailed_filtered = summary_df_detailed[['index',
 'count',
 '% Valid',
 '% Invalid',
 '% FN',
 '% WE2 FP',
 '% WE5 FP',
 'Invalid_with_FN',
 'Invalid_with_WE2_FP',
 'Invalid_with_WE5_FP',
 '% Invalid_with_FN',
 '% Invalid_with_WE2_FP',
 '% Invalid_with_WE5_FP'
]]


# --- 14️⃣ Display the enhanced table ---
display(summary_df_detailed_filtered)
index count % Valid % Invalid % FN % WE2 FP % WE5 FP Invalid_with_FN Invalid_with_WE2_FP Invalid_with_WE5_FP % Invalid_with_FN % Invalid_with_WE2_FP % Invalid_with_WE5_FP
0 < -1000 0 NaN NaN NaN NaN NaN 0.0 0.0 0.0 NaN NaN NaN
1 -1000 to -501 0 NaN NaN NaN NaN NaN 0.0 0.0 0.0 NaN NaN NaN
2 -500 to -301 4 100.0 0.0 100.0 0.0 0.0 0.0 0.0 0.0 NaN NaN NaN
3 -300 to -151 68 94.1 5.9 5.9 0.0 0.0 2.0 0.0 0.0 50.0 0.0 0.0
4 -150 to -71 943 92.2 7.8 7.7 0.2 0.6 18.0 0.0 0.0 24.3 0.0 0.0
5 -70 to 0 3966 92.3 7.7 13.6 0.7 0.4 100.0 6.0 4.0 32.6 2.0 1.3
6 > 0 0 NaN NaN NaN NaN NaN 0.0 0.0 0.0 NaN NaN NaN
In [58]:
# --- Deep dive: failure flags that are Invalid ---

def compute_flag_invalid_overlap(group):
    fn_total = group['FN'].sum()
    we2_total = group['WE2_FP'].sum()
    we5_total = group['WE5_FP'].sum()

    fn_invalid = group.loc[group['FN'] & (group['status'] == 'Invalid')].shape[0]
    we2_invalid = group.loc[group['WE2_FP'] & (group['status'] == 'Invalid')].shape[0]
    we5_invalid = group.loc[group['WE5_FP'] & (group['status'] == 'Invalid')].shape[0]

    return pd.Series({
        'FN_total': fn_total,
        'FN_invalid': fn_invalid,
        '% FN_invalid': 100 * fn_invalid / fn_total if fn_total else np.nan,
        'WE2_FP_total': we2_total,
        'WE2_FP_invalid': we2_invalid,
        '% WE2_FP_invalid': 100 * we2_invalid / we2_total if we2_total else np.nan,
        'WE5_FP_total': we5_total,
        'WE5_FP_invalid': we5_invalid,
        '% WE5_FP_invalid': 100 * we5_invalid / we5_total if we5_total else np.nan,
    })

flag_invalid_overlap = (
    df_fluA_filtered.groupby('we1_bins')
    .apply(compute_flag_invalid_overlap)
    .reindex(labels)
)

# --- Combine into your summary ---
summary_df_detailed = pd.concat([summary_df.set_index('bin_range'), flag_invalid_overlap], axis=1).reset_index()

# --- Round percentage columns ---
pct_cols = ['% FN_invalid', '% WE2_FP_invalid', '% WE5_FP_invalid']
summary_df_detailed[pct_cols] = summary_df_detailed[pct_cols].round(1)


#---------Filtering to desired columns------
summary_df_detailed_filtered = summary_df_detailed[['index',
 'count',
 '% Valid',
 '% Invalid',
 '% FN',
 '% WE2 FP',
 '% WE5 FP',
 '% FN_invalid', '% WE2_FP_invalid', '% WE5_FP_invalid'
]]

display(summary_df_detailed_filtered)
index count % Valid % Invalid % FN % WE2 FP % WE5 FP % FN_invalid % WE2_FP_invalid % WE5_FP_invalid
0 < -1000 0 NaN NaN NaN NaN NaN NaN NaN NaN
1 -1000 to -501 0 NaN NaN NaN NaN NaN NaN NaN NaN
2 -500 to -301 4 100.0 0.0 100.0 0.0 0.0 0.0 NaN NaN
3 -300 to -151 68 94.1 5.9 5.9 0.0 0.0 50.0 NaN NaN
4 -150 to -71 943 92.2 7.8 7.7 0.2 0.6 24.7 0.0 0.0
5 -70 to 0 3966 92.3 7.7 13.6 0.7 0.4 18.6 23.1 28.6
6 > 0 0 NaN NaN NaN NaN NaN NaN NaN NaN
In [59]:
import pandas as pd
import numpy as np

# -------------------------------
# 1️⃣ Define continuous thresholds
# -------------------------------
thresholds = list(range(-50, -1025, -25))  # -50, -325, ..., -1000
thresholds.append(-999999)  # catch ≤ -1000
threshold_labels = [str(t) if t != -999999 else "≤ -1000" for t in thresholds]

# -------------------------------
# 2️⃣ Ensure status and FP/FN columns exist
# -------------------------------
def classify_status(row):
    """Return Valid/Invalid based on Warnings/Errors presence."""
    for col in ['Warnings', 'Errors']:
        if col in row and pd.notna(row[col]) and str(row[col]).strip():
            return "Invalid"
    return "Valid"

df_analysis['status'] = df_analysis.apply(classify_status, axis=1)

for flag in ['FN', 'WE2_FP', 'WE5_FP']:
    if flag not in df_analysis.columns:
        df_analysis[flag] = False

# -------------------------------
# 3️⃣ Loop through thresholds
# -------------------------------
results = []

for i, t in enumerate(thresholds):
    label = threshold_labels[i]

    # Subset for this threshold
    if t == -999999:
        subset = df_analysis[df_analysis['we1_step66_tailmean'] <= -1000]
    else:
        subset = df_analysis[df_analysis['we1_step66_tailmean'] <= t]

    if subset.empty:
        continue

    n = len(subset)

    # Valid / Invalid counts
    valid = (subset['status'] == 'Valid').sum()
    invalid = (subset['status'] == 'Invalid').sum()

    # FN / FP totals
    fn_total = subset['FN'].sum()
    we2_total = subset['WE2_FP'].sum()
    we5_total = subset['WE5_FP'].sum()

    # FN / FP also invalid
    fn_invalid = subset.loc[subset['FN'] & (subset['status'] == 'Invalid')].shape[0]
    we2_invalid = subset.loc[subset['WE2_FP'] & (subset['status'] == 'Invalid')].shape[0]
    we5_invalid = subset.loc[subset['WE5_FP'] & (subset['status'] == 'Invalid')].shape[0]

    # Append results
    results.append({
        'Threshold': label,
        'Count': n,
        '% Valid': 100 * valid / n,
        '% Invalid': 100 * invalid / n,
        '% FN': 100 * fn_total / n,
        '% WE2 FP': 100 * we2_total / n,
        '% WE5 FP': 100 * we5_total / n,
        'FN_total': fn_total,
        'FN_invalid': fn_invalid,
        '% FN_invalid': 100 * fn_invalid / fn_total if fn_total else np.nan,
        'WE2_FP_total': we2_total,
        'WE2_FP_invalid': we2_invalid,
        '% WE2_FP_invalid': 100 * we2_invalid / we2_total if we2_total else np.nan,
        'WE5_FP_total': we5_total,
        'WE5_FP_invalid': we5_invalid,
        '% WE5_FP_invalid': 100 * we5_invalid / we5_total if we5_total else np.nan
    })

# -------------------------------
# 4️⃣ Convert to DataFrame
# -------------------------------
threshold_summary_df = pd.DataFrame(results)

# Round percentages
pct_cols = [col for col in threshold_summary_df.columns if col.startswith('%')]
threshold_summary_df[pct_cols] = threshold_summary_df[pct_cols].round(1)

# -------------------------------
# 5️⃣ Optional: filter columns
# -------------------------------
threshold_summary_df_filtered_fluA = threshold_summary_df[[
    'Threshold', 'Count', '% Valid', '% Invalid', '% FN', '% WE2 FP', '% WE5 FP',
    'FN_total', 'FN_invalid', '% FN_invalid',
    'WE2_FP_total', 'WE2_FP_invalid', '% WE2_FP_invalid',
    'WE5_FP_total', 'WE5_FP_invalid', '% WE5_FP_invalid'
]]

# -------------------------------
# 6️⃣ Display
# -------------------------------
display(threshold_summary_df_filtered_fluA)
Threshold Count % Valid % Invalid % FN % WE2 FP % WE5 FP FN_total FN_invalid % FN_invalid WE2_FP_total WE2_FP_invalid % WE2_FP_invalid WE5_FP_total WE5_FP_invalid % WE5_FP_invalid
0 -50 1808 91.8 8.2 9.5 0.4 0.4 172 41 23.8 8 0 0.0 8 0 0.0
1 -75 938 91.6 8.4 7.9 0.2 0.6 74 19 25.7 2 0 0.0 6 0 0.0
2 -100 443 90.1 9.9 8.6 0.0 0.0 38 11 28.9 0 0 NaN 0 0 NaN
3 -125 170 95.9 4.1 8.8 0.0 0.0 15 5 33.3 0 0 NaN 0 0 NaN
4 -150 73 93.2 6.8 12.3 0.0 0.0 9 3 33.3 0 0 NaN 0 0 NaN
5 -175 22 100.0 0.0 27.3 0.0 0.0 6 0 0.0 0 0 NaN 0 0 NaN
6 -200 14 100.0 0.0 42.9 0.0 0.0 6 0 0.0 0 0 NaN 0 0 NaN
7 -225 12 100.0 0.0 50.0 0.0 0.0 6 0 0.0 0 0 NaN 0 0 NaN
8 -250 10 100.0 0.0 60.0 0.0 0.0 6 0 0.0 0 0 NaN 0 0 NaN
9 -275 8 100.0 0.0 75.0 0.0 0.0 6 0 0.0 0 0 NaN 0 0 NaN
10 -300 4 100.0 0.0 100.0 0.0 0.0 4 0 0.0 0 0 NaN 0 0 NaN
11 -325 4 100.0 0.0 100.0 0.0 0.0 4 0 0.0 0 0 NaN 0 0 NaN
In [60]:
import matplotlib.pyplot as plt

# Create the figure and main axis
fig, ax1 = plt.subplots(figsize=(10,6))

# --- Primary y-axis: Percentages ---
ax1.plot(threshold_summary_df_filtered_fluA['Threshold'], threshold_summary_df_filtered_fluA['% Invalid'], marker='o', label='% Invalid')
ax1.plot(threshold_summary_df_filtered_fluA['Threshold'], threshold_summary_df_filtered_fluA['% FN'], marker='o', label='% FN')
ax1.plot(threshold_summary_df_filtered_fluA['Threshold'], threshold_summary_df_filtered_fluA['% WE2 FP'], marker='o', label='% WE2 FP')
ax1.plot(threshold_summary_df_filtered_fluA['Threshold'], threshold_summary_df_filtered_fluA['% WE5 FP'], marker='o', label='% WE5 FP')

ax1.set_xlabel('WE1 Tail Mean Threshold')
ax1.set_ylabel('Percentage (%)', color='black')
ax1.tick_params(axis='y', labelcolor='black')
ax1.set_xticklabels(threshold_summary_df_filtered_fluA['Threshold'], rotation=45)
ax1.grid(True)

# --- Secondary y-axis: Count overlay ---
ax2 = ax1.twinx()
ax2.plot(threshold_summary_df_filtered_fluA['Threshold'], threshold_summary_df_filtered_fluA['Count'],
         color='gray', linestyle='--', marker='x', label='Count')
ax2.set_ylabel('Sample Count', color='gray')
ax2.tick_params(axis='y', labelcolor='gray')

# --- Combine legends ---
lines, labels = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines + lines2, labels + labels2, loc='upper left')

# Title and layout
plt.title('Quality Metrics vs WE1 Tail Mean Threshold')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [61]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# --- Create a copy to avoid modifying the original ---
df = threshold_summary_df_filtered_fluA.copy()

# --- Extract numeric threshold values (e.g. from "≤ -300" → -300.0) ---
def parse_threshold(x):
    if isinstance(x, str):
        x = x.replace('≤', '').strip()
    try:
        return float(x)
    except:
        return np.nan

df['Threshold_num'] = df['Threshold'].apply(parse_threshold)

# --- Sort thresholds numerically (least → most negative) ---
df = df.sort_values(by='Threshold_num', ascending=True)

# --- Compute cumulative percentages ---
df['cum_% Invalid'] = df['% Invalid'].cumsum() / np.arange(1, len(df) + 1)
df['cum_% FN'] = df['% FN'].cumsum() / np.arange(1, len(df) + 1)
df['cum_% WE2 FP'] = df['% WE2 FP'].cumsum() / np.arange(1, len(df) + 1)
df['cum_% WE5 FP'] = df['% WE5 FP'].cumsum() / np.arange(1, len(df) + 1)

# --- Convert Count to % of total ---
df['% Count'] = 100 * df['Count'] / df['Count'].sum()

# --- Create the figure and main axis ---
fig, ax1 = plt.subplots(figsize=(10,6))

# --- Primary y-axis: Cumulative Percentages ---
ax1.plot(df['Threshold_num'], df['cum_% Invalid'], marker='o', label='Cumulative % Invalid')
ax1.plot(df['Threshold_num'], df['cum_% FN'], marker='o', label='Cumulative % FN')
ax1.plot(df['Threshold_num'], df['cum_% WE2 FP'], marker='o', label='Cumulative % WE2 FP')
ax1.plot(df['Threshold_num'], df['cum_% WE5 FP'], marker='o', label='Cumulative % WE5 FP')

ax1.set_xlabel('WE1 Tail Mean Threshold')
ax1.set_ylabel('Cumulative Percentage (%)', color='black')
ax1.tick_params(axis='y', labelcolor='black')
ax1.set_xticks(df['Threshold_num'])
ax1.set_xticklabels(df['Threshold'], rotation=45)
ax1.grid(True)

# --- Reverse the X-axis so least negative (-50) → most negative (-1000) ---
ax1.invert_xaxis()

# --- Secondary y-axis: % Count overlay ---
ax2 = ax1.twinx()
ax2.plot(df['Threshold_num'], df['% Count'], color='gray', linestyle='--', marker='x', label='% of Total Count')
ax2.set_ylabel('Sample % of Total', color='gray')
ax2.tick_params(axis='y', labelcolor='gray')

# --- Combine legends ---
lines, labels = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines + lines2, labels + labels2, loc='upper left')

# --- Title and layout ---
plt.title('Cumulative Quality Metrics vs WE1 Tail Mean Threshold')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [62]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# --- Copy original ---
df = threshold_summary_df_filtered_fluA.copy()

# --- Parse numeric threshold values (handle labels like "≤ -1000") ---
def parse_threshold(x):
    if isinstance(x, str):
        x = x.replace('≤', '').strip()
    try:
        return float(x)
    except:
        return np.nan

df['Threshold_num'] = df['Threshold'].apply(parse_threshold)

# --- Sort thresholds numerically (least → most negative) ---
df = df.sort_values(by='Threshold_num', ascending=False)

# --- Convert Count to % of total ---
df['% Count'] = 100 * df['Count'] / df['Count'].sum()

# --- Compute cumulative sums ---
df['cum_Count'] = df['Count'].cumsum()
df['cum_Invalid'] = (df['% Invalid'] / 100 * df['Count']).cumsum()
df['cum_FN'] = (df['% FN'] / 100 * df['Count']).cumsum()
df['cum_WE2_FP'] = (df['% WE2 FP'] / 100 * df['Count']).cumsum()
df['cum_WE5_FP'] = (df['% WE5 FP'] / 100 * df['Count']).cumsum()

# --- Convert cumulative sums to percentages of total Count ---
total_count = df['Count'].sum()
df['cum_% Invalid'] = 100 * df['cum_Invalid'] / total_count
df['cum_% FN'] = 100 * df['cum_FN'] / total_count
df['cum_% WE2 FP'] = 100 * df['cum_WE2_FP'] / total_count
df['cum_% WE5 FP'] = 100 * df['cum_WE5_FP'] / total_count

# --- Plot ---
fig, ax1 = plt.subplots(figsize=(10,6))

# --- Primary axis: cumulative metrics ---
ax1.plot(df['Threshold_num'], df['cum_% Invalid'], marker='o', label='Cumulative % Invalid')
ax1.plot(df['Threshold_num'], df['cum_% FN'], marker='o', label='Cumulative % FN')
ax1.plot(df['Threshold_num'], df['cum_% WE2 FP'], marker='o', label='Cumulative % WE2 FP')
ax1.plot(df['Threshold_num'], df['cum_% WE5 FP'], marker='o', label='Cumulative % WE5 FP')

ax1.set_xlabel('WE1 Tail Mean Threshold')
ax1.set_ylabel('Cumulative Percentage of Total Samples (%)', color='black')
ax1.tick_params(axis='y', labelcolor='black')
ax1.set_xticks(df['Threshold_num'])
ax1.set_xticklabels(df['Threshold'], rotation=45)
ax1.grid(True)

# --- Reverse x-axis (least negative → most negative) ---
ax1.invert_xaxis()

# --- Secondary axis: % Count per threshold ---
ax2 = ax1.twinx()
ax2.plot(df['Threshold_num'], df['% Count'], color='gray', linestyle='--', marker='x', label='% of Total Count (per bin)')
ax2.set_ylabel('Sample % of Total (bin-level)', color='gray')
ax2.tick_params(axis='y', labelcolor='gray')

# --- Combine legends ---
lines, labels = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines + lines2, labels + labels2, loc='upper left')

plt.title('Cumulative Quality Metrics vs WE1 Tail Mean Threshold')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [63]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Copy source
df = threshold_summary_df_filtered_fluA.copy()

# Parse numeric thresholds (handle "≤ -1000" etc.)
def parse_threshold(x):
    if isinstance(x, str):
        x = x.replace('≤', '').strip()
    try:
        return float(x)
    except:
        return np.nan

df['Threshold_num'] = df['Threshold'].apply(parse_threshold)

# --- Ensure we only use rows with valid numeric threshold ---
df = df.dropna(subset=['Threshold_num']).copy()

# Sort in the natural accumulation order: least negative -> most negative
df = df.sort_values('Threshold_num', ascending=False).reset_index(drop=True)

# Ensure numeric columns
for col in ['Count', '% FN', '% Invalid', '% WE2 FP', '% WE5 FP']:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

# Per-bin absolute counts of events (from percent and count)
df['FN_count']     = (df['% FN'] / 100.0) * df['Count']
df['Invalid_count']= (df['% Invalid'] / 100.0) * df['Count']
df['WE2FP_count']  = (df['% WE2 FP'] / 100.0) * df['Count']
df['WE5FP_count']  = (df['% WE5 FP'] / 100.0) * df['Count']

# Running-average style (not usually desired for accumulation)
df['running_avg_%FN'] = df['% FN'].cumsum() / np.arange(1, len(df) + 1)

# True cumulative totals (absolute) and convert to percent of total samples
total_samples = df['Count'].sum() if df['Count'].sum() else 1
df['cum_FN_count'] = df['FN_count'].cumsum()
df['cum_%FN_total'] = 100.0 * df['cum_FN_count'] / total_samples

# Also cum for Invalids / FPs
df['cum_Invalid_count'] = df['Invalid_count'].cumsum()
df['cum_%Invalid_total'] = 100.0 * df['cum_Invalid_count'] / total_samples
df['cum_WE2FP_count'] = df['WE2FP_count'].cumsum()
df['cum_%WE2FP_total'] = 100.0 * df['cum_WE2FP_count'] / total_samples
df['cum_WE5FP_count'] = df['WE5FP_count'].cumsum()
df['cum_%WE5FP_total'] = 100.0 * df['cum_WE5FP_count'] / total_samples

# Diagnostic: check monotonicity (should be non-decreasing)
def is_monotonic_nondec(series):
    return (np.diff(series) >= -1e-8).all()

monotonic_fn = is_monotonic_nondec(df['cum_%FN_total'].values)
if not monotonic_fn:
    print("WARNING: cum_%FN_total is not monotonic non-decreasing. Inspect 'FN_count' and ordering.")

# Add % Total Samples column (aka % Count)
df['% Count'] = 100.0 * df['Count'] / total_samples

# Add Highlight column for rows where % Count ~ 5% (±0.5%)
tolerance = 0.5
df['Highlight_5pct'] = df['% Count'].apply(lambda x: '<< 5% mark' if abs(x - 5.0) <= tolerance else '')

# --- Plot both versions for comparison ---
fig, ax = plt.subplots(figsize=(11,6))

# True cumulative proportions (preferred)
ax.plot(df['Threshold_num'], df['cum_%Invalid_total'], marker='o', label='True cum % Invalid (of total samples)')
ax.plot(df['Threshold_num'], df['cum_%FN_total'], marker='o', label='True cum % FN (of total samples)')
ax.plot(df['Threshold_num'], df['cum_%WE2FP_total'], marker='o', label='True cum % WE2 FP (of total samples)')
ax.plot(df['Threshold_num'], df['cum_%WE5FP_total'], marker='o', label='True cum % WE5 FP (of total samples)')

# Running-average lines (for reference)
ax.plot(df['Threshold_num'], df['running_avg_%FN'], marker='x', linestyle='--', label='Running avg %FN (not cumulative)')

# X axis labels
ax.set_xticks(df['Threshold_num'])
ax.set_xticklabels(df['Threshold'], rotation=45)
ax.invert_xaxis()  # show least negative -> most negative left->right
ax.set_xlabel('WE1 Tail Mean Threshold')
ax.set_ylabel('Percentage (%)')
ax.grid(True)
ax.legend(loc='upper left')

# Right axis: percent of samples per threshold (bin-level)
ax2 = ax.twinx()
ax2.plot(df['Threshold_num'], df['% Count'], color='gray', linestyle='--', marker='x', label='% of total (per threshold)')
ax2.set_ylabel('% of total samples (per threshold)', color='gray')
ax2.tick_params(axis='y', labelcolor='gray')

# Show plot
plt.title('True cumulative vs running-average %FN (and other metrics)')
plt.tight_layout()
plt.show()

# Optional: print a small table for debugging with new columns
display_cols = ['Threshold', 'Count', '% Count', '% FN', 'FN_count', 'cum_FN_count', 'cum_%FN_total', 'Highlight_5pct']
print(df[display_cols].head(12).to_string(index=False))
No description has been provided for this image
Threshold  Count   % Count  % FN  FN_count  cum_FN_count  cum_%FN_total Highlight_5pct
      -50   1808 51.568739   9.5   171.760       171.760       4.899030               
      -75    938 26.754136   7.9    74.102       245.862       7.012607               
     -100    443 12.635482   8.6    38.098       283.960       8.099258               
     -125    170  4.848831   8.8    14.960       298.920       8.525956     << 5% mark
     -150     73  2.082145  12.3     8.979       307.899       8.782059               
     -175     22  0.627496  27.3     6.006       313.905       8.953366               
     -200     14  0.399315  42.9     6.006       319.911       9.124672               
     -225     12  0.342270  50.0     6.000       325.911       9.295807               
     -250     10  0.285225  60.0     6.000       331.911       9.466942               
     -275      8  0.228180  75.0     6.000       337.911       9.638078               
     -300      4  0.114090 100.0     4.000       341.911       9.752168               
     -325      4  0.114090 100.0     4.000       345.911       9.866258               
In [64]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from kneed import KneeLocator  # You'll need to install this package via pip

def analyze_we1_cutoff_tradeoff(threshold_df, threshold_col='Threshold', count_col='Count', fn_percent_col='% FN'):
    """
    Analyze tradeoff between sample exclusion and FN exclusion by WE1 threshold.
    Detect knee point to recommend cutoff.

    Parameters:
    - threshold_df: pd.DataFrame with threshold summary
    - threshold_col: column name with threshold labels
    - count_col: column with sample counts per threshold bin
    - fn_percent_col: column with % FN per threshold bin

    Returns:
    - recommended_cutoff: numeric threshold value at knee point
    - plot figure with annotated tradeoff curve
    - augmented DataFrame with cumulative stats and knee point info
    """
    df = threshold_summary_df_filtered_fluA.copy()

    # Parse numeric threshold (handle "≤ -1000" etc.)
    def parse_threshold(x):
        if isinstance(x, str):
            x = x.replace('≤', '').strip()
        try:
            return float(x)
        except:
            return np.nan

    df['Threshold_num'] = df[threshold_col].apply(parse_threshold)
    df = df.dropna(subset=['Threshold_num']).copy()

    # Sort descending (least negative -> most negative)
    df = df.sort_values('Threshold_num', ascending=False).reset_index(drop=True)

    # Ensure numeric columns
    for col in [count_col, fn_percent_col]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

    total_samples = df[count_col].sum()
    total_fn = ((df[fn_percent_col] / 100) * df[count_col]).sum()

    # Calculate FN counts and cumulative sums (reversed for exclusion)
    df['FN_count'] = (df[fn_percent_col] / 100) * df[count_col]

    # Reverse cumulative sums (samples excluded and FN excluded if cutoff at each threshold)
    df['samples_excluded'] = df[count_col][::-1].cumsum()[::-1]
    df['fn_excluded'] = df['FN_count'][::-1].cumsum()[::-1]

    # Percents of totals
    df['% samples_excluded'] = 100 * df['samples_excluded'] / total_samples
    df['% fn_excluded'] = 100 * df['fn_excluded'] / total_fn

    # Detect knee/elbow point
    knee = KneeLocator(
        df['% samples_excluded'], df['% fn_excluded'],
        curve='convex', direction='increasing', S=2
    )
    knee_point = knee.knee

    # Get threshold value at knee if found
    if knee_point is not None:
        recommended_cutoff = df.loc[(df['% samples_excluded'] - knee_point).abs().idxmin(), 'Threshold_num']
    else:
        recommended_cutoff = None

    # Plot tradeoff curve with knee point annotated
    plt.figure(figsize=(9,6))
    plt.plot(df['% samples_excluded'], df['% fn_excluded'], marker='o', label='FN Excluded vs Samples Excluded')
    if knee_point is not None:
        plt.axvline(knee_point, color='red', linestyle='--', label=f'Knee at {knee_point:.2f}% samples excluded')
        knee_y = df.loc[(df['% samples_excluded'] - knee_point).abs().idxmin(), '% fn_excluded']
        plt.scatter(knee_point, knee_y, color='red')
        plt.annotate(f"Threshold: {recommended_cutoff}\nFN Excluded: {knee_y:.1f}%",
                     xy=(knee_point, knee_y), xytext=(knee_point+2, knee_y-5),
                     arrowprops=dict(facecolor='black', shrink=0.05))
    plt.xlabel('% Samples Excluded (WE1 < cutoff)')
    plt.ylabel('% FN Excluded (caught)')
    plt.title('Tradeoff Curve: % FN Excluded vs % Samples Excluded')
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()

    return recommended_cutoff, df


recommended_cutoff, augmented_df = analyze_we1_cutoff_tradeoff(threshold_summary_df_filtered_fluA)
print(f"Recommended WE1 threshold cutoff: {recommended_cutoff}")
display(augmented_df.head(10))
No description has been provided for this image
Recommended WE1 threshold cutoff: -200.0
Threshold Count % Valid % Invalid % FN % WE2 FP % WE5 FP FN_total FN_invalid % FN_invalid ... % WE2_FP_invalid WE5_FP_total WE5_FP_invalid % WE5_FP_invalid Threshold_num FN_count samples_excluded fn_excluded % samples_excluded % fn_excluded
0 -50 1808 91.8 8.2 9.5 0.4 0.4 172 41 23.8 ... 0.0 8 0 0.0 -50.0 171.760 3506 345.911 100.000000 100.000000
1 -75 938 91.6 8.4 7.9 0.2 0.6 74 19 25.7 ... 0.0 6 0 0.0 -75.0 74.102 1698 174.151 48.431261 50.345609
2 -100 443 90.1 9.9 8.6 0.0 0.0 38 11 28.9 ... NaN 0 0 NaN -100.0 38.098 760 100.049 21.677125 28.923336
3 -125 170 95.9 4.1 8.8 0.0 0.0 15 5 33.3 ... NaN 0 0 NaN -125.0 14.960 317 61.951 9.041643 17.909520
4 -150 73 93.2 6.8 12.3 0.0 0.0 9 3 33.3 ... NaN 0 0 NaN -150.0 8.979 147 46.991 4.192812 13.584708
5 -175 22 100.0 0.0 27.3 0.0 0.0 6 0 0.0 ... NaN 0 0 NaN -175.0 6.006 74 38.012 2.110667 10.988954
6 -200 14 100.0 0.0 42.9 0.0 0.0 6 0 0.0 ... NaN 0 0 NaN -200.0 6.006 52 32.006 1.483172 9.252669
7 -225 12 100.0 0.0 50.0 0.0 0.0 6 0 0.0 ... NaN 0 0 NaN -225.0 6.000 38 26.000 1.083856 7.516384
8 -250 10 100.0 0.0 60.0 0.0 0.0 6 0 0.0 ... NaN 0 0 NaN -250.0 6.000 26 20.000 0.741586 5.781834
9 -275 8 100.0 0.0 75.0 0.0 0.0 6 0 0.0 ... NaN 0 0 NaN -275.0 6.000 16 14.000 0.456361 4.047284

10 rows × 22 columns

In [65]:
import matplotlib.pyplot as plt

# --- Plot: Stacked bar of outcome percentages per bin ---
fig, ax = plt.subplots(figsize=(10, 6))

# Define category columns to plot
plot_cols = ['% Invalid', '% FN', '% WE2 FP', '% WE5 FP']
colors = ['#4CAF50', '#E74C3C', '#3498DB', '#F1C40F', '#9B59B6']  # nice clean palette

bottom = np.zeros(len(summary_df))
for col, color in zip(plot_cols, colors):
    ax.bar(summary_df['bin_range'], summary_df[col], bottom=bottom, label=col, color=color)
    bottom += summary_df[col].values

# Aesthetics
ax.set_title("Test Validity and Flag Breakdown per WE1 Bin", fontsize=14, pad=12)
ax.set_xlabel("WE1 Bin Range", fontsize=12)
ax.set_ylabel("Percentage (%)", fontsize=12)
ax.set_ylim(0, 100)
ax.legend(title="Status", bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(axis='y', linestyle='--', alpha=0.5)
plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.show()
No description has been provided for this image
In [66]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# --- CONFIG: use the actual raw dataframe and correct WE1 column name ---
df_raw = df_fluA_filtered.copy()   # your raw FluA dataframe
value_col = 'we1_step66_tailmean'  # correct column used elsewhere in our convo
fn_col = 'FN'                      # binary 0/1 column indicating False Negative

# --- define bin edges you used before (adjust if you want different cut points) ---
bin_edges = [-np.inf, -325, -300, -275, -250, -225, -200, -175, -150, -125, -100, -75, -50, np.inf]
# Make readable labels (we'll display the left-inclusive upper edge representation)
bin_labels = ['<-325', '-325 to -301', '-300 to -276', '-275 to -251', '-250 to -226',
              '-225 to -201', '-200 to -176', '-175 to -151', '-150 to -126', '-125 to -101',
              '-100 to -76', '-75 to -51', '-50 to 0_or_more']

# --- Quick diagnostics: existence & nulls in the WE1 column ---
print("Raw summary:")
print(f"Total rows in df_raw: {len(df_raw)}")
print(f"Column '{value_col}' present? {value_col in df_raw.columns}")
print(f"Non-null values in '{value_col}': {df_raw[value_col].notna().sum()}")
print(f"FN total from raw data (sum of `{fn_col}`): {df_raw[fn_col].sum()}\n")

# --- BINNING (defensive) ---
df_raw['_tmp_bin'] = pd.cut(df_raw[value_col], bins=bin_edges, labels=bin_labels, right=True)

# Show how many rows ended up in bins vs how many were dropped (NaN)
bin_counts_with_nan = df_raw['_tmp_bin'].value_counts(dropna=False).to_frame(name='count')
print("Counts per bin (including NaN):")
print(bin_counts_with_nan)
print("\nNumber of rows with NaN bin (not placed into any bin):", df_raw['_tmp_bin'].isna().sum())

# --- AGGREGATE BY BIN (only non-NaN bins) ---
df_summary = (
    df_raw.dropna(subset=['_tmp_bin'])
    .groupby('_tmp_bin', observed=True)
    .agg(
        Count=('FN', 'size'),
        FN_count=(fn_col, 'sum')
    )
    .reset_index()
)

# Reindex to ensure all bins present in order
df_summary = df_summary.set_index('_tmp_bin').reindex(bin_labels).reset_index()

# --- Compute percentages & cumulative ---
df_summary['% FN'] = 100.0 * df_summary['FN_count'] / df_summary['Count']
df_summary['% FN'] = df_summary['% FN'].fillna(0)  # avoid NaN when Count=0
df_summary['Count'] = df_summary['Count'].fillna(0).astype(int)
df_summary['FN_count'] = df_summary['FN_count'].fillna(0).astype(int)

# --- SORT bins from least negative to most negative ---
df_summary_sorted = df_summary.iloc[::-1].reset_index(drop=True)

# Recalculate cumulative columns in forward order
df_summary_sorted['cum_FN_count'] = df_summary_sorted['FN_count'].cumsum()
total_samples_in_bins = df_summary_sorted['Count'].sum()
df_summary_sorted['cum_%FN_total'] = 100.0 * df_summary_sorted['cum_FN_count'] / total_samples_in_bins if total_samples_in_bins else 0.0
df_summary_sorted['% Count'] = 100.0 * df_summary_sorted['Count'] / total_samples_in_bins if total_samples_in_bins else 0.0
df_summary_sorted['cum_%Samples'] = df_summary_sorted['% Count'].cumsum()

# --- Diagnostics: compare sums to raw data ---
total_fn_raw = int(df_raw[fn_col].sum())
total_fn_binned = int(df_summary['FN_count'].sum())
print(f"\nTotal FN in raw data: {total_fn_raw}")
print(f"Total FN in binned (non-NaN) data: {total_fn_binned}")
print(f"Total rows in raw: {len(df_raw)}, total rows placed in bins: {int(df_raw['_tmp_bin'].notna().sum())}")

# --- Display sorted summary table ---
display_cols = ['_tmp_bin', 'Count', '% Count', '% FN', 'FN_count', 'cum_FN_count', 'cum_%FN_total']
print("\nBinned summary (sorted least negative → most negative):")
print(df_summary_sorted[display_cols].to_string(index=False))

# --- Plot cumulative FN and samples excluded ---
fig, ax1 = plt.subplots(figsize=(11,6))

color_fn = 'tab:blue'
color_sample = 'tab:gray'

ax1.set_xlabel('WE1 Tail Mean Threshold Bins (least negative → most negative)')
ax1.set_ylabel('Cumulative % FN (of samples)', color=color_fn)
ax1.plot(df_summary_sorted['_tmp_bin'], df_summary_sorted['cum_%FN_total'], marker='o', color=color_fn, label='Cumulative % FN')
ax1.tick_params(axis='y', labelcolor=color_fn)
ax1.set_xticklabels(df_summary_sorted['_tmp_bin'], rotation=45, ha='right')

ax2 = ax1.twinx()
ax2.set_ylabel('Cumulative % Samples (excluded)', color=color_sample)
ax2.plot(df_summary_sorted['_tmp_bin'], df_summary_sorted['cum_%Samples'], marker='x', linestyle='--', color=color_sample, label='Cumulative % Samples')
ax2.tick_params(axis='y', labelcolor=color_sample)

fig.suptitle('Cumulative % FN vs % Samples Excluded by WE1 Tail Mean Threshold Bins')
fig.tight_layout()
plt.show()

# --- Optional: print examples for any NaN bins ---
nan_mask = df_raw['_tmp_bin'].isna()
if nan_mask.any():
    print("\nExamples (first 10) of rows where bin is NaN (shows WE1 value):")
    print(df_raw.loc[nan_mask, [value_col, fn_col]].head(10).to_string(index=False))
else:
    print("\nAll rows were placed into bins successfully (no NaNs).")

# --- Clean up temporary column ---
df_raw.drop(columns=['_tmp_bin'], inplace=True)
Raw summary:
Total rows in df_raw: 4981
Column 'we1_step66_tailmean' present? True
Non-null values in 'we1_step66_tailmean': 4981
FN total from raw data (sum of `FN`): 619

Counts per bin (including NaN):
                  count
_tmp_bin               
-50 to 0_or_more   3248
-75 to -51          839
-100 to -76         471
-125 to -101        256
-150 to -126         95
-175 to -151         50
-200 to -176          8
<-325                 4
-300 to -276          4
-275 to -251          2
-250 to -226          2
-225 to -201          2
-325 to -301          0

Number of rows with NaN bin (not placed into any bin): 0

Total FN in raw data: 619
Total FN in binned (non-NaN) data: 619
Total rows in raw: 4981, total rows placed in bins: 4981

Binned summary (sorted least negative → most negative):
        _tmp_bin  Count   % Count       % FN  FN_count  cum_FN_count  cum_%FN_total
-50 to 0_or_more   3248 65.207790  13.885468       451           451       9.054407
      -75 to -51    839 16.844007  11.442193        96           547      10.981731
     -100 to -76    471  9.455933   7.643312        36           583      11.704477
    -125 to -101    256  5.139530   8.593750        22           605      12.146155
    -150 to -126     95  1.907248   6.315789         6           611      12.266613
    -175 to -151     50  1.003814   4.000000         2           613      12.306766
    -200 to -176      8  0.160610   0.000000         0           613      12.306766
    -225 to -201      2  0.040153   0.000000         0           613      12.306766
    -250 to -226      2  0.040153   0.000000         0           613      12.306766
    -275 to -251      2  0.040153   0.000000         0           613      12.306766
    -300 to -276      4  0.080305  50.000000         2           615      12.346918
    -325 to -301      0  0.000000   0.000000         0           615      12.346918
           <-325      4  0.080305 100.000000         4           619      12.427223
No description has been provided for this image
All rows were placed into bins successfully (no NaNs).
In [67]:
print(df_fluA_filtered.shape)
print(df_fluA_filtered['FN'].value_counts())
(4981, 172)
FN
0    4362
1     619
Name: count, dtype: int64

FluB WE1 PreCheck Analysis¶

In [68]:
df_fluB.shape
Out[68]:
(11121, 170)
In [69]:
# Standardizing 'Date Tested' format

df_fluB['Date Tested'] = pd.to_datetime(df_fluB['Date Tested'], format='mixed')

# Standardizing "Reader SN" strings

df_fluB['Reader SN'] = df_fluB['Reader SN'].astype(str)
df_fluB['Reader SN'] = df_fluB['Reader SN'].str.strip()
df_fluB['Reader SN'] = df_fluB['Reader SN'].str.replace(r'\.0$', '', regex=True)

# Keeping only 745 MUX (666 Flu+CoV PID, modified RV flow check), FABC-1.0.0, and FABC-005-13Aug25

df_fluB = df_fluB[df_fluB['Procedure'].isin(['745 MUX', 'FABC-1.0.0', 'FABC-005-13Aug25'])]
In [70]:
# drop duplicated

df_fluB_filtered = df_fluB.drop_duplicates()

df_fluB_filtered.shape
Out[70]:
(3436, 170)
In [71]:
# Building df_analysis_fluB forlater analysis work

# Use a clean copy of the data
df_analysis_fluB = df_fluB.copy()  # or df_cov_filtered, depending on the dataset

# Ensure the column exists and drop NA only for the analysis
df_analysis_fluB = df_analysis_fluB[df_analysis_fluB['we1_step66_tailmean'].notna()]
In [72]:
# Visualizing 'we1_step66_tailmean values as a function of WE4 values

plt.figure(figsize=(10, 6))
plt.scatter(df_fluB['WE2'], df_fluB['we1_step66_tailmean'], alpha=0.5)
plt.xlabel('WE2')
plt.ylabel('we1_step66_tailmean')
plt.title('Scatter Plot of we1_step66_tailmean vs. WE2')
# horizontal line at -1000
plt.axhline(y=-1000, color='r', linestyle='--')
# vertical line at -3000
plt.axvline(x=-3000, color='g', linestyle='--')
plt.grid(True)
plt.show()
No description has been provided for this image
In [73]:
# Binning the WE1 precheck data

col = "we1_step66_tailmean"
data = df_fluB_filtered[col].dropna()

# Define your bins and labels
bins = [float("-inf"), -1000, -500, -300, -150, -70, 0, float("inf")]
labels = [
    "< -1000",
    "-1000 to -501",
    "-500 to -301",
    "-300 to -151",
    "-150 to -71",
    "-70 to 0",
    "> 0"
]

# Apply binning
binned = pd.cut(data, bins=bins, labels=labels, include_lowest=True)
summary = binned.value_counts().sort_index().reset_index()
summary.columns = ["bin_range", "count"]

# Add percentages
summary["percent"] = (summary["count"] / summary["count"].sum() * 100).round(2)

print(summary)
       bin_range  count  percent
0        < -1000      1     0.03
1  -1000 to -501      2     0.06
2   -500 to -301      0     0.00
3   -300 to -151     39     1.14
4    -150 to -71    657    19.12
5       -70 to 0   2737    79.66
6            > 0      0     0.00
In [74]:
# Visualizing the bins

plt.figure(figsize=(8,4))
plt.bar(summary["bin_range"], summary["percent"])
plt.title(f"Distribution of {col}")
plt.xlabel("Value Range")
plt.ylabel("Percent of total")
plt.xticks(rotation=30)
# Annotating percentage above each bar
for i, v in enumerate(summary["percent"]):
    plt.text(i, v, f"{v}%", ha="center", va="bottom")
plt.show()
No description has been provided for this image
In [75]:
# Looking at WE1, WE2, WE3, WE4, WE5, and WE6 values of my individual bins

# Add the binned data as a new column to df_fluB_filtered
df_fluB_filtered['we1_bins'] = binned

# Group by the created bins and describe the WE columns
we_analysis = df_fluB_filtered.groupby('we1_bins')[['WE1', 'WE2', 'WE3', 'WE4', 'WE5', 'WE6']].describe()
display(we_analysis)
WE1 WE2 ... WE5 WE6
count mean std min 25% 50% 75% max count mean ... 75% max count mean std min 25% 50% 75% max
we1_bins
< -1000 1.0 -1541.000000 NaN -1541.0 -1541.00 -1541.0 -1541.00 -1541.0 1.0 -509.000000 ... -19.0 -19.0 1.0 -10964.000000 NaN -10964.0 -10964.00 -10964.0 -10964.00 -10964.0
-1000 to -501 2.0 -11806.500000 15329.367909 -22646.0 -17226.25 -11806.5 -6386.75 -967.0 2.0 -1748.000000 ... -192.0 -168.0 2.0 -6113.500000 8080.109189 -11827.0 -8970.25 -6113.5 -3256.75 -400.0
-500 to -301 0.0 NaN NaN NaN NaN NaN NaN NaN 0.0 NaN ... NaN NaN 0.0 NaN NaN NaN NaN NaN NaN NaN
-300 to -151 39.0 -14690.000000 7422.432666 -31324.0 -18279.00 -14192.0 -10514.00 -212.0 39.0 -10804.128205 ... -96.0 -63.0 39.0 -15217.717949 4089.536359 -21757.0 -18572.00 -15328.0 -12270.50 -3626.0
-150 to -71 657.0 -15167.610350 4435.737040 -28921.0 -17781.00 -14921.0 -12690.00 -81.0 657.0 -11433.709285 ... -88.0 -27.0 657.0 -16030.412481 3621.157492 -28912.0 -18295.00 -16401.0 -14149.00 0.0
-70 to 0 2737.0 -17739.273292 5504.144611 -34711.0 -21224.00 -18138.0 -14937.00 0.0 2737.0 -11519.077822 ... -73.0 0.0 2737.0 -15529.258677 3418.808676 -26828.0 -17948.00 -15576.0 -13302.00 0.0
> 0 0.0 NaN NaN NaN NaN NaN NaN NaN 0.0 NaN ... NaN NaN 0.0 NaN NaN NaN NaN NaN NaN NaN

7 rows × 48 columns

In [76]:
import matplotlib.pyplot as plt
import numpy as np

we_means = df_fluB_filtered.groupby('we1_bins')[['WE1', 'WE2', 'WE3', 'WE4', 'WE5', 'WE6']].mean()
we_means = we_means.reindex(labels)  # ensure correct bin order

bins = we_means.index
we_cols = we_means.columns
x = np.arange(len(bins))  # the label locations
width = 0.13  # width of each bar

plt.figure(figsize=(10,5))
for i, col in enumerate(we_cols):
    plt.bar(x + i*width, we_means[col], width=width, label=col)

plt.xticks(x + width*2.5, bins, rotation=30)
plt.ylabel("Mean WE# Value")
plt.xlabel("WE1 Bins")
plt.title("Mean WE# Values per Bin")
plt.legend(title="WE#")
plt.legend(bbox_to_anchor=(1.11, 1), loc='upper right')
plt.hlines(y=-3000, xmin=x.min(), xmax=x.max(), linestyles='dashed')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [77]:
import seaborn as sns

plt.figure(figsize=(8,6))
sns.heatmap(we_means, annot=True, fmt=".1f", cmap="coolwarm", cbar_kws={'label': 'Mean Value'})
plt.title("Mean WE# Values by we1_bin")
plt.ylabel("WE1 Bins")
plt.xlabel("WE#")
plt.xticks(rotation=0)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [78]:
import pandas as pd

# 1️⃣ Compute mean WE values per bin
we_means = df_fluB_filtered.groupby('we1_bins')[['WE1', 'WE2', 'WE3', 'WE4', 'WE5', 'WE6']].mean()

# Ensure bins are in the correct order
we_means = we_means.reindex(labels)

# 2️⃣ Count samples per bin
bin_counts = df_fluB_filtered['we1_bins'].value_counts().reindex(labels)
bin_counts.name = "count"

# 3️⃣ Identify the control bin
control_bin = "-70 to 0"
control_values = we_means.loc[control_bin]

# 4️⃣ Compute Delta_WE# relative to control
delta_cols = {col: f"Delta_{col}" for col in we_means.columns}
delta_values = we_means - control_values
delta_values.rename(columns=delta_cols, inplace=True)

# 5️⃣ Combine counts, means, and deltas into one summary DataFrame
summary_df = pd.concat([bin_counts, we_means, delta_values], axis=1).reset_index()
summary_df.rename(columns={"we1_bins": "bin_range"}, inplace=True)

# 6️⃣ Format numeric columns to 1 decimal place
we_cols = ['WE1','WE2','WE3','WE4','WE5','WE6']
delta_cols = [f"Delta_{col}" for col in we_cols]
summary_df[we_cols + delta_cols] = summary_df[we_cols + delta_cols].round(1)

# 7️⃣ Display the table
display(summary_df)
bin_range count WE1 WE2 WE3 WE4 WE5 WE6 Delta_WE1 Delta_WE2 Delta_WE3 Delta_WE4 Delta_WE5 Delta_WE6
0 < -1000 1 -1541.0 -509.0 -420.0 -173.0 -19.0 -10964.0 16198.3 11010.1 -136.4 -46.9 100.1 4565.3
1 -1000 to -501 2 -11806.5 -1748.0 -446.0 -182.5 -216.0 -6113.5 5932.8 9771.1 -162.4 -56.4 -96.9 9415.8
2 -500 to -301 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 -300 to -151 39 -14690.0 -10804.1 -278.2 -167.2 -154.2 -15217.7 3049.3 714.9 5.3 -41.1 -35.1 311.5
4 -150 to -71 657 -15167.6 -11433.7 -262.1 -139.4 -143.1 -16030.4 2571.7 85.4 21.5 -13.4 -24.1 -501.2
5 -70 to 0 2737 -17739.3 -11519.1 -283.6 -126.1 -119.1 -15529.3 0.0 0.0 0.0 0.0 0.0 0.0
6 > 0 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
In [79]:
# Plot the Delta WE's

delta_only = summary_df.set_index('bin_range')[delta_cols]

plt.figure(figsize=(8,6))
sns.heatmap(delta_only, annot=True, fmt=".1f", cmap="coolwarm", cbar_kws={'label': 'Delta Value'})
plt.title("Delta WE# Values Relative to Control Bin (-70 to 0)")
plt.ylabel("WE1 Bins")
plt.xlabel("WE#")
plt.xticks(rotation=0)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [80]:
import pandas as pd
import numpy as np

# --- 1️⃣ Compute mean WE values per bin ---
we_means = df_fluB_filtered.groupby('we1_bins')[['WE1', 'WE2', 'WE3', 'WE4', 'WE5', 'WE6']].mean()
we_means = we_means.reindex(labels)

# --- 2️⃣ Count samples per bin ---
bin_counts = df_fluB_filtered['we1_bins'].value_counts().reindex(labels)
bin_counts.name = "count"

# --- 3️⃣ Identify control bin ---
control_bin = "-70 to 0"
control_values = we_means.loc[control_bin]

# --- 4️⃣ Compute ΔWE relative to control ---
delta_cols = {col: f"Delta_{col}" for col in we_means.columns}
delta_values = we_means - control_values
delta_values.rename(columns=delta_cols, inplace=True)

# --- 5️⃣ Cartridge-level status determination ---
def classify_status(row):
    """Return Valid/Invalid based on warnings/errors presence."""
    has_issue = False
    for col in ['Warnings', 'Errors']:
        if col in row and pd.notna(row[col]) and str(row[col]).strip():
            has_issue = True
            break
    return "Invalid" if has_issue else "Valid"

df_fluB_filtered['status'] = df_fluB_filtered.apply(classify_status, axis=1)

# --- 6️⃣ Define helper flags (FN, WE3_FP, WE5_FP) ---
# Assuming these flags are boolean or identifiable by name in columns:
for flag in ['FN', 'WE3_FP', 'WE5_FP']:
    if flag not in df_fluB_filtered.columns:
        df_fluB_filtered[flag] = False  # ensure column exists

# --- 7️⃣ Compute percentages per bin ---
def compute_percentages(group):
    n = len(group)
    valid = (group['status'] == 'Valid').sum()
    invalid = (group['status'] == 'Invalid').sum()
    fn = group['FN'].sum()
    we3_fp = group['WE3_FP'].sum()
    we5_fp = group['WE5_FP'].sum()
    return pd.Series({
        '% Valid': 100 * valid / n if n else np.nan,
        '% Invalid': 100 * invalid / n if n else np.nan,
        '% FN': 100 * fn / n if n else np.nan,
        '% WE3 FP': 100 * we3_fp / n if n else np.nan,
        '% WE5 FP': 100 * we5_fp / n if n else np.nan
    })

percentages = df_fluB_filtered.groupby('we1_bins').apply(compute_percentages).reindex(labels)

# --- 8️⃣ Combine all results into one summary table ---
summary_df = pd.concat([bin_counts, we_means, delta_values, percentages], axis=1).reset_index()
summary_df.rename(columns={"we1_bins": "bin_range"}, inplace=True)

# --- 9️⃣ Format numeric columns to 1 decimal place ---
we_cols = ['WE1','WE2','WE3','WE4','WE5','WE6']
delta_cols = [f"Delta_{col}" for col in we_cols]
summary_df[we_cols + delta_cols] = summary_df[we_cols + delta_cols].round(1)
summary_df[['% Valid', '% Invalid', '% FN', '% WE3 FP', '% WE5 FP']] = \
    summary_df[['% Valid', '% Invalid', '% FN', '% WE3 FP', '% WE5 FP']].round(1)

# --- 🔟 Display the final summary ---
display(summary_df)
bin_range count WE1 WE2 WE3 WE4 WE5 WE6 Delta_WE1 Delta_WE2 Delta_WE3 Delta_WE4 Delta_WE5 Delta_WE6 % Valid % Invalid % FN % WE3 FP % WE5 FP
0 < -1000 1 -1541.0 -509.0 -420.0 -173.0 -19.0 -10964.0 16198.3 11010.1 -136.4 -46.9 100.1 4565.3 0.0 100.0 100.0 0.0 0.0
1 -1000 to -501 2 -11806.5 -1748.0 -446.0 -182.5 -216.0 -6113.5 5932.8 9771.1 -162.4 -56.4 -96.9 9415.8 50.0 50.0 100.0 0.0 0.0
2 -500 to -301 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 -300 to -151 39 -14690.0 -10804.1 -278.2 -167.2 -154.2 -15217.7 3049.3 714.9 5.3 -41.1 -35.1 311.5 84.6 15.4 15.4 0.0 0.0
4 -150 to -71 657 -15167.6 -11433.7 -262.1 -139.4 -143.1 -16030.4 2571.7 85.4 21.5 -13.4 -24.1 -501.2 93.5 6.5 13.1 0.0 0.2
5 -70 to 0 2737 -17739.3 -11519.1 -283.6 -126.1 -119.1 -15529.3 0.0 0.0 0.0 0.0 0.0 0.0 94.2 5.8 17.4 0.1 0.2
6 > 0 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
In [81]:
# --- 11️⃣ Deep dive: failure modes among Invalid samples ---

def compute_invalid_overlap(group):
    invalid_group = group[group['status'] == 'Invalid']
    n_invalid = len(invalid_group)
    if n_invalid == 0:
        return pd.Series({
            'Invalid_with_FN': 0,
            'Invalid_with_WE3_FP': 0,
            'Invalid_with_WE5_FP': 0,
            '% Invalid_with_FN': np.nan,
            '% Invalid_with_WE3_FP': np.nan,
            '% Invalid_with_WE5_FP': np.nan
        })
    return pd.Series({
        'Invalid_with_FN': invalid_group['FN'].sum(),
        'Invalid_with_WE3_FP': invalid_group['WE3_FP'].sum(),
        'Invalid_with_WE5_FP': invalid_group['WE5_FP'].sum(),
        '% Invalid_with_FN': 100 * invalid_group['FN'].sum() / n_invalid,
        '% Invalid_with_WE3_FP': 100 * invalid_group['WE3_FP'].sum() / n_invalid,
        '% Invalid_with_WE5_FP': 100 * invalid_group['WE5_FP'].sum() / n_invalid
    })

invalid_overlap = (
    df_fluB_filtered.groupby('we1_bins')
    .apply(compute_invalid_overlap)
    .reindex(labels)
)

# --- 12️⃣ Combine with main summary ---
summary_df_detailed = pd.concat([summary_df.set_index('bin_range'), invalid_overlap], axis=1).reset_index()

# --- 13️⃣ Round numeric values ---
cols_to_round = [
    '% Invalid_with_FN', '% Invalid_with_WE3_FP', '% Invalid_with_WE5_FP'
]
summary_df_detailed[cols_to_round] = summary_df_detailed[cols_to_round].round(1)


#--------Columns to keep-------------
summary_df_detailed_filtered = summary_df_detailed[['index',
 'count',
 '% Valid',
 '% Invalid',
 '% FN',
 '% WE3 FP',
 '% WE5 FP',
 'Invalid_with_FN',
 'Invalid_with_WE3_FP',
 'Invalid_with_WE5_FP',
 '% Invalid_with_FN',
 '% Invalid_with_WE3_FP',
 '% Invalid_with_WE5_FP'
]]


# --- 14️⃣ Display the enhanced table ---
display(summary_df_detailed_filtered)
index count % Valid % Invalid % FN % WE3 FP % WE5 FP Invalid_with_FN Invalid_with_WE3_FP Invalid_with_WE5_FP % Invalid_with_FN % Invalid_with_WE3_FP % Invalid_with_WE5_FP
0 < -1000 1 0.0 100.0 100.0 0.0 0.0 1.0 0.0 0.0 100.0 0.0 0.0
1 -1000 to -501 2 50.0 50.0 100.0 0.0 0.0 1.0 0.0 0.0 100.0 0.0 0.0
2 -500 to -301 0 NaN NaN NaN NaN NaN 0.0 0.0 0.0 NaN NaN NaN
3 -300 to -151 39 84.6 15.4 15.4 0.0 0.0 2.0 0.0 0.0 33.3 0.0 0.0
4 -150 to -71 657 93.5 6.5 13.1 0.0 0.2 6.0 0.0 0.0 14.0 0.0 0.0
5 -70 to 0 2737 94.2 5.8 17.4 0.1 0.2 50.0 3.0 1.0 31.2 1.9 0.6
6 > 0 0 NaN NaN NaN NaN NaN 0.0 0.0 0.0 NaN NaN NaN
In [82]:
# --- Deep dive: failure flags that are Invalid ---

def compute_flag_invalid_overlap(group):
    fn_total = group['FN'].sum()
    we3_total = group['WE3_FP'].sum()
    we5_total = group['WE5_FP'].sum()

    fn_invalid = group.loc[group['FN'] & (group['status'] == 'Invalid')].shape[0]
    we3_invalid = group.loc[group['WE3_FP'] & (group['status'] == 'Invalid')].shape[0]
    we5_invalid = group.loc[group['WE5_FP'] & (group['status'] == 'Invalid')].shape[0]

    return pd.Series({
        'FN_total': fn_total,
        'FN_invalid': fn_invalid,
        '% FN_invalid': 100 * fn_invalid / fn_total if fn_total else np.nan,
        'WE3_FP_total': we3_total,
        'WE3_FP_invalid': we3_invalid,
        '% WE3_FP_invalid': 100 * we3_invalid / we3_total if we3_total else np.nan,
        'WE5_FP_total': we5_total,
        'WE5_FP_invalid': we5_invalid,
        '% WE5_FP_invalid': 100 * we5_invalid / we5_total if we5_total else np.nan,
    })

flag_invalid_overlap = (
    df_fluB_filtered.groupby('we1_bins')
    .apply(compute_flag_invalid_overlap)
    .reindex(labels)
)

# --- Combine into your summary ---
summary_df_detailed = pd.concat([summary_df.set_index('bin_range'), flag_invalid_overlap], axis=1).reset_index()

# --- Round percentage columns ---
pct_cols = ['% FN_invalid', '% WE3_FP_invalid', '% WE5_FP_invalid']
summary_df_detailed[pct_cols] = summary_df_detailed[pct_cols].round(1)


#--------Filtering to Desired Columns
summary_df_detailed_filtered = summary_df_detailed[['index',
 'count',
 '% Valid',
 '% Invalid',
 '% FN',
 '% WE5 FP',
 '% WE3 FP',
'% FN_invalid', '% WE3_FP_invalid', '% WE5_FP_invalid'
]]

display(summary_df_detailed_filtered)
index count % Valid % Invalid % FN % WE5 FP % WE3 FP % FN_invalid % WE3_FP_invalid % WE5_FP_invalid
0 < -1000 1 0.0 100.0 100.0 0.0 0.0 100.0 NaN NaN
1 -1000 to -501 2 50.0 50.0 100.0 0.0 0.0 50.0 NaN NaN
2 -500 to -301 0 NaN NaN NaN NaN NaN NaN NaN NaN
3 -300 to -151 39 84.6 15.4 15.4 0.0 0.0 33.3 NaN NaN
4 -150 to -71 657 93.5 6.5 13.1 0.2 0.0 7.0 NaN 0.0
5 -70 to 0 2737 94.2 5.8 17.4 0.2 0.1 10.5 75.0 20.0
6 > 0 0 NaN NaN NaN NaN NaN NaN NaN NaN
In [83]:
import pandas as pd
import numpy as np

# -------------------------------
# 1️⃣ Define continuous thresholds
# -------------------------------
thresholds = list(range(-50, -1025, -25))  # -50, -325, ..., -1000
thresholds.append(-999999)  # catch ≤ -1000
threshold_labels = [str(t) if t != -999999 else "≤ -1000" for t in thresholds]

# -------------------------------
# 2️⃣ Ensure status and FP/FN columns exist
# -------------------------------
def classify_status(row):
    """Return Valid/Invalid based on Warnings/Errors presence."""
    for col in ['Warnings', 'Errors']:
        if col in row and pd.notna(row[col]) and str(row[col]).strip():
            return "Invalid"
    return "Valid"

df_analysis_fluB['status'] = df_analysis_fluB.apply(classify_status, axis=1)

for flag in ['FN', 'WE3_FP', 'WE5_FP']:
    if flag not in df_analysis_fluB.columns:
        df_analysis_fluB[flag] = False

# -------------------------------
# 3️⃣ Loop through thresholds
# -------------------------------
results = []

for i, t in enumerate(thresholds):
    label = threshold_labels[i]

    # Subset for this threshold
    if t == -999999:
        subset = df_analysis_fluB[df_analysis_fluB['we1_step66_tailmean'] <= -1000]
    else:
        subset = df_analysis_fluB[df_analysis_fluB['we1_step66_tailmean'] <= t]

    if subset.empty:
        continue

    n = len(subset)

    # Valid / Invalid counts
    valid = (subset['status'] == 'Valid').sum()
    invalid = (subset['status'] == 'Invalid').sum()

    # FN / FP totals
    fn_total = subset['FN'].sum()
    we3_total = subset['WE3_FP'].sum()
    we5_total = subset['WE5_FP'].sum()

    # FN / FP also invalid
    fn_invalid = subset.loc[subset['FN'] & (subset['status'] == 'Invalid')].shape[0]
    we3_invalid = subset.loc[subset['WE3_FP'] & (subset['status'] == 'Invalid')].shape[0]
    we5_invalid = subset.loc[subset['WE5_FP'] & (subset['status'] == 'Invalid')].shape[0]

    # Append results
    results.append({
        'Threshold': label,
        'Count': n,
        '% Valid': 100 * valid / n,
        '% Invalid': 100 * invalid / n,
        '% FN': 100 * fn_total / n,
        '% WE3 FP': 100 * we3_total / n,
        '% WE5 FP': 100 * we5_total / n,
        'FN_total': fn_total,
        'FN_invalid': fn_invalid,
        '% FN_invalid': 100 * fn_invalid / fn_total if fn_total else np.nan,
        'WE3_FP_total': we3_total,
        'WE3_FP_invalid': we3_invalid,
        '% WE3_FP_invalid': 100 * we3_invalid / we3_total if we3_total else np.nan,
        'WE5_FP_total': we5_total,
        'WE5_FP_invalid': we5_invalid,
        '% WE5_FP_invalid': 100 * we5_invalid / we5_total if we5_total else np.nan
    })

# -------------------------------
# 4️⃣ Convert to DataFrame
# -------------------------------
threshold_summary_df = pd.DataFrame(results)

# Round percentages
pct_cols = [col for col in threshold_summary_df.columns if col.startswith('%')]
threshold_summary_df[pct_cols] = threshold_summary_df[pct_cols].round(1)

# -------------------------------
# 5️⃣ Optional: filter columns
# -------------------------------
threshold_summary_df_filtered_fluB = threshold_summary_df[[
    'Threshold', 'Count', '% Valid', '% Invalid', '% FN', '% WE3 FP', '% WE5 FP',
    'FN_total', 'FN_invalid', '% FN_invalid',
    'WE3_FP_total', 'WE3_FP_invalid', '% WE3_FP_invalid',
    'WE5_FP_total', 'WE5_FP_invalid', '% WE5_FP_invalid'
]]

# -------------------------------
# 6️⃣ Display
# -------------------------------
display(threshold_summary_df_filtered_fluB)
Threshold Count % Valid % Invalid % FN % WE3 FP % WE5 FP FN_total FN_invalid % FN_invalid WE3_FP_total WE3_FP_invalid % WE3_FP_invalid WE5_FP_total WE5_FP_invalid % WE5_FP_invalid
0 -50 1896 93.4 6.6 11.4 0.0 0.2 217 24 11.1 0 0 NaN 4 0 0.0
1 -75 913 91.7 8.3 12.3 0.0 0.0 112 20 17.9 0 0 NaN 0 0 NaN
2 -100 432 90.7 9.3 10.0 0.0 0.0 43 16 37.2 0 0 NaN 0 0 NaN
3 -125 182 89.6 10.4 13.7 0.0 0.0 25 13 52.0 0 0 NaN 0 0 NaN
4 -150 74 79.7 20.3 25.7 0.0 0.0 19 9 47.4 0 0 NaN 0 0 NaN
5 -175 42 83.3 16.7 31.0 0.0 0.0 13 7 53.8 0 0 NaN 0 0 NaN
6 -200 26 73.1 26.9 42.3 0.0 0.0 11 7 63.6 0 0 NaN 0 0 NaN
7 -225 20 65.0 35.0 45.0 0.0 0.0 9 7 77.8 0 0 NaN 0 0 NaN
8 -250 13 46.2 53.8 69.2 0.0 0.0 9 7 77.8 0 0 NaN 0 0 NaN
9 -275 7 28.6 71.4 100.0 0.0 0.0 7 5 71.4 0 0 NaN 0 0 NaN
10 -300 7 28.6 71.4 100.0 0.0 0.0 7 5 71.4 0 0 NaN 0 0 NaN
11 -325 7 28.6 71.4 100.0 0.0 0.0 7 5 71.4 0 0 NaN 0 0 NaN
12 -350 7 28.6 71.4 100.0 0.0 0.0 7 5 71.4 0 0 NaN 0 0 NaN
13 -375 7 28.6 71.4 100.0 0.0 0.0 7 5 71.4 0 0 NaN 0 0 NaN
14 -400 7 28.6 71.4 100.0 0.0 0.0 7 5 71.4 0 0 NaN 0 0 NaN
15 -425 7 28.6 71.4 100.0 0.0 0.0 7 5 71.4 0 0 NaN 0 0 NaN
16 -450 7 28.6 71.4 100.0 0.0 0.0 7 5 71.4 0 0 NaN 0 0 NaN
17 -475 7 28.6 71.4 100.0 0.0 0.0 7 5 71.4 0 0 NaN 0 0 NaN
18 -500 7 28.6 71.4 100.0 0.0 0.0 7 5 71.4 0 0 NaN 0 0 NaN
19 -525 7 28.6 71.4 100.0 0.0 0.0 7 5 71.4 0 0 NaN 0 0 NaN
20 -550 7 28.6 71.4 100.0 0.0 0.0 7 5 71.4 0 0 NaN 0 0 NaN
21 -575 5 0.0 100.0 100.0 0.0 0.0 5 5 100.0 0 0 NaN 0 0 NaN
22 -600 5 0.0 100.0 100.0 0.0 0.0 5 5 100.0 0 0 NaN 0 0 NaN
23 -625 5 0.0 100.0 100.0 0.0 0.0 5 5 100.0 0 0 NaN 0 0 NaN
24 -650 5 0.0 100.0 100.0 0.0 0.0 5 5 100.0 0 0 NaN 0 0 NaN
25 -675 5 0.0 100.0 100.0 0.0 0.0 5 5 100.0 0 0 NaN 0 0 NaN
26 -700 5 0.0 100.0 100.0 0.0 0.0 5 5 100.0 0 0 NaN 0 0 NaN
27 -725 5 0.0 100.0 100.0 0.0 0.0 5 5 100.0 0 0 NaN 0 0 NaN
28 -750 5 0.0 100.0 100.0 0.0 0.0 5 5 100.0 0 0 NaN 0 0 NaN
29 -775 5 0.0 100.0 100.0 0.0 0.0 5 5 100.0 0 0 NaN 0 0 NaN
30 -800 5 0.0 100.0 100.0 0.0 0.0 5 5 100.0 0 0 NaN 0 0 NaN
31 -825 5 0.0 100.0 100.0 0.0 0.0 5 5 100.0 0 0 NaN 0 0 NaN
32 -850 5 0.0 100.0 100.0 0.0 0.0 5 5 100.0 0 0 NaN 0 0 NaN
33 -875 5 0.0 100.0 100.0 0.0 0.0 5 5 100.0 0 0 NaN 0 0 NaN
34 -900 5 0.0 100.0 100.0 0.0 0.0 5 5 100.0 0 0 NaN 0 0 NaN
35 -925 5 0.0 100.0 100.0 0.0 0.0 5 5 100.0 0 0 NaN 0 0 NaN
36 -950 5 0.0 100.0 100.0 0.0 0.0 5 5 100.0 0 0 NaN 0 0 NaN
37 -975 5 0.0 100.0 100.0 0.0 0.0 5 5 100.0 0 0 NaN 0 0 NaN
38 -1000 2 0.0 100.0 100.0 0.0 0.0 2 2 100.0 0 0 NaN 0 0 NaN
39 ≤ -1000 2 0.0 100.0 100.0 0.0 0.0 2 2 100.0 0 0 NaN 0 0 NaN
In [84]:
import matplotlib.pyplot as plt

# Create the figure and main axis
fig, ax1 = plt.subplots(figsize=(10,6))

# --- Primary y-axis: Percentages ---
ax1.plot(threshold_summary_df_filtered_fluB['Threshold'], threshold_summary_df_filtered_fluB['% Invalid'], marker='o', label='% Invalid')
ax1.plot(threshold_summary_df_filtered_fluB['Threshold'], threshold_summary_df_filtered_fluB['% FN'], marker='o', label='% FN')
ax1.plot(threshold_summary_df_filtered_fluB['Threshold'], threshold_summary_df_filtered_fluB['% WE3 FP'], marker='o', label='% WE3 FP')
ax1.plot(threshold_summary_df_filtered_fluB['Threshold'], threshold_summary_df_filtered_fluB['% WE5 FP'], marker='o', label='% WE5 FP')

ax1.set_xlabel('WE1 Tail Mean Threshold')
ax1.set_ylabel('Percentage (%)', color='black')
ax1.tick_params(axis='y', labelcolor='black')
ax1.set_xticklabels(threshold_summary_df_filtered_fluB['Threshold'], rotation=45)
ax1.grid(True)

# --- Secondary y-axis: Count overlay ---
ax2 = ax1.twinx()
ax2.plot(threshold_summary_df_filtered_fluB['Threshold'], threshold_summary_df_filtered_fluB['Count'],
         color='gray', linestyle='--', marker='x', label='Count')
ax2.set_ylabel('Sample Count', color='gray')
ax2.tick_params(axis='y', labelcolor='gray')

# --- Combine legends ---
lines, labels = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines + lines2, labels + labels2, loc='upper left')

# Title and layout
plt.title('Quality Metrics vs WE1 Tail Mean Threshold')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [84]:
 
In [85]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# --- Copy original ---
df = threshold_summary_df_filtered_fluB.copy()

# --- Parse numeric threshold values (handle labels like "≤ -1000") ---
def parse_threshold(x):
    if isinstance(x, str):
        x = x.replace('≤', '').strip()
    try:
        return float(x)
    except:
        return np.nan

df['Threshold_num'] = df['Threshold'].apply(parse_threshold)

# --- Sort thresholds numerically (least → most negative) ---
df = df.sort_values(by='Threshold_num', ascending=False)

# --- Convert Count to % of total ---
df['% Count'] = 100 * df['Count'] / df['Count'].sum()

# --- Compute cumulative sums ---
df['cum_Count'] = df['Count'].cumsum()
df['cum_Invalid'] = (df['% Invalid'] / 100 * df['Count']).cumsum()
df['cum_FN'] = (df['% FN'] / 100 * df['Count']).cumsum()
df['cum_WE3_FP'] = (df['% WE3 FP'] / 100 * df['Count']).cumsum()
df['cum_WE5_FP'] = (df['% WE5 FP'] / 100 * df['Count']).cumsum()

# --- Convert cumulative sums to percentages of total Count ---
total_count = df['Count'].sum()
df['cum_% Invalid'] = 100 * df['cum_Invalid'] / total_count
df['cum_% FN'] = 100 * df['cum_FN'] / total_count
df['cum_% WE3 FP'] = 100 * df['cum_WE3_FP'] / total_count
df['cum_% WE5 FP'] = 100 * df['cum_WE5_FP'] / total_count

# --- Plot ---
fig, ax1 = plt.subplots(figsize=(10,6))

# --- Primary axis: cumulative metrics ---
ax1.plot(df['Threshold_num'], df['cum_% Invalid'], marker='o', label='Cumulative % Invalid')
ax1.plot(df['Threshold_num'], df['cum_% FN'], marker='o', label='Cumulative % FN')
ax1.plot(df['Threshold_num'], df['cum_% WE3 FP'], marker='o', label='Cumulative % WE3 FP')
ax1.plot(df['Threshold_num'], df['cum_% WE5 FP'], marker='o', label='Cumulative % WE5 FP')

ax1.set_xlabel('WE1 Tail Mean Threshold')
ax1.set_ylabel('Cumulative Percentage of Total Samples (%)', color='black')
ax1.tick_params(axis='y', labelcolor='black')
ax1.set_xticks(df['Threshold_num'])
ax1.set_xticklabels(df['Threshold'], rotation=45)
ax1.grid(True)

# --- Reverse x-axis (least negative → most negative) ---
ax1.invert_xaxis()

# --- Secondary axis: % Count per threshold ---
ax2 = ax1.twinx()
ax2.plot(df['Threshold_num'], df['% Count'], color='gray', linestyle='--', marker='x', label='% of Total Count (per bin)')
ax2.set_ylabel('Sample % of Total (bin-level)', color='gray')
ax2.tick_params(axis='y', labelcolor='gray')

# --- Combine legends ---
lines, labels = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines + lines2, labels + labels2, loc='upper left')

plt.title('Cumulative Quality Metrics vs WE1 Tail Mean Threshold')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [86]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Copy source
df = threshold_summary_df_filtered_fluB.copy()

# Parse numeric thresholds (handle "≤ -1000" etc.)
def parse_threshold(x):
    if isinstance(x, str):
        x = x.replace('≤', '').strip()
    try:
        return float(x)
    except:
        return np.nan

df['Threshold_num'] = df['Threshold'].apply(parse_threshold)

# --- Ensure we only use rows with valid numeric threshold ---
df = df.dropna(subset=['Threshold_num']).copy()

# Sort in the natural accumulation order: least negative -> most negative
df = df.sort_values('Threshold_num', ascending=False).reset_index(drop=True)

# Ensure numeric columns
for col in ['Count', '% FN', '% Invalid', '% WE3 FP', '% WE5 FP']:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

# Per-bin absolute counts of events (from percent and count)
df['FN_count']     = (df['% FN'] / 100.0) * df['Count']
df['Invalid_count']= (df['% Invalid'] / 100.0) * df['Count']
df['WE3FP_count']  = (df['% WE3 FP'] / 100.0) * df['Count']
df['WE5FP_count']  = (df['% WE5 FP'] / 100.0) * df['Count']

# Running-average style (not usually desired for accumulation)
df['running_avg_%FN'] = df['% FN'].cumsum() / np.arange(1, len(df) + 1)

# True cumulative totals (absolute) and convert to percent of total samples
total_samples = df['Count'].sum() if df['Count'].sum() else 1
df['cum_FN_count'] = df['FN_count'].cumsum()
df['cum_%FN_total'] = 100.0 * df['cum_FN_count'] / total_samples

# Also cum for Invalids / FPs
df['cum_Invalid_count'] = df['Invalid_count'].cumsum()
df['cum_%Invalid_total'] = 100.0 * df['cum_Invalid_count'] / total_samples
df['cum_WE3FP_count'] = df['WE3FP_count'].cumsum()
df['cum_%WE3FP_total'] = 100.0 * df['cum_WE3FP_count'] / total_samples
df['cum_WE5FP_count'] = df['WE5FP_count'].cumsum()
df['cum_%WE5FP_total'] = 100.0 * df['cum_WE5FP_count'] / total_samples

# Diagnostic: check monotonicity (should be non-decreasing)
def is_monotonic_nondec(series):
    return (np.diff(series) >= -1e-8).all()

monotonic_fn = is_monotonic_nondec(df['cum_%FN_total'].values)
if not monotonic_fn:
    print("WARNING: cum_%FN_total is not monotonic non-decreasing. Inspect 'FN_count' and ordering.")

# Add % Total Samples column (aka % Count)
df['% Count'] = 100.0 * df['Count'] / total_samples

# Add Highlight column for rows where % Count ~ 5% (±0.5%)
tolerance = 0.5
df['Highlight_5pct'] = df['% Count'].apply(lambda x: '<< 5% mark' if abs(x - 5.0) <= tolerance else '')

# --- Plot both versions for comparison ---
fig, ax = plt.subplots(figsize=(11,6))

# True cumulative proportions (preferred)
ax.plot(df['Threshold_num'], df['cum_%Invalid_total'], marker='o', label='True cum % Invalid (of total samples)')
ax.plot(df['Threshold_num'], df['cum_%FN_total'], marker='o', label='True cum % FN (of total samples)')
ax.plot(df['Threshold_num'], df['cum_%WE3FP_total'], marker='o', label='True cum % WE3 FP (of total samples)')
ax.plot(df['Threshold_num'], df['cum_%WE5FP_total'], marker='o', label='True cum % WE5 FP (of total samples)')

# Running-average lines (for reference)
ax.plot(df['Threshold_num'], df['running_avg_%FN'], marker='x', linestyle='--', label='Running avg %FN (not cumulative)')

# X axis labels
ax.set_xticks(df['Threshold_num'])
ax.set_xticklabels(df['Threshold'], rotation=45)
ax.invert_xaxis()  # show least negative -> most negative left->right
ax.set_xlabel('WE1 Tail Mean Threshold')
ax.set_ylabel('Percentage (%)')
ax.grid(True)
ax.legend(loc='upper left')

# Right axis: percent of samples per threshold (bin-level)
ax2 = ax.twinx()
ax2.plot(df['Threshold_num'], df['% Count'], color='gray', linestyle='--', marker='x', label='% of total (per threshold)')
ax2.set_ylabel('% of total samples (per threshold)', color='gray')
ax2.tick_params(axis='y', labelcolor='gray')

# Show plot
plt.title('True cumulative vs running-average %FN (and other metrics)')
plt.tight_layout()
plt.show()

# Optional: print a small table for debugging with new columns
display_cols = ['Threshold', 'Count', '% Count', '% FN', 'FN_count', 'cum_FN_count', 'cum_%FN_total', 'Highlight_5pct']
print(df[display_cols].head(12).to_string(index=False))
No description has been provided for this image
Threshold  Count   % Count  % FN  FN_count  cum_FN_count  cum_%FN_total Highlight_5pct
      -50   1896 50.278441  11.4   216.144       216.144       5.731742               
      -75    913 24.211085  12.3   112.299       328.443       8.709706               
     -100    432 11.455847  10.0    43.200       371.643       9.855290               
     -125    182  4.826306  13.7    24.934       396.577      10.516494     << 5% mark
     -150     74  1.962344  25.7    19.018       415.595      11.020817               
     -175     42  1.113763  31.0    13.020       428.615      11.366083               
     -200     26  0.689472  42.3    10.998       439.613      11.657730               
     -225     20  0.530363  45.0     9.000       448.613      11.896394               
     -250     13  0.344736  69.2     8.996       457.609      12.134951               
     -275      7  0.185627 100.0     7.000       464.609      12.320578               
     -300      7  0.185627 100.0     7.000       471.609      12.506205               
     -325      7  0.185627 100.0     7.000       478.609      12.691832               
In [87]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from kneed import KneeLocator  # You'll need to install this package via pip

def analyze_we1_cutoff_tradeoff(threshold_df, threshold_col='Threshold', count_col='Count', fn_percent_col='% FN'):
    """
    Analyze tradeoff between sample exclusion and FN exclusion by WE1 threshold.
    Detect knee point to recommend cutoff.

    Parameters:
    - threshold_df: pd.DataFrame with threshold summary
    - threshold_col: column name with threshold labels
    - count_col: column with sample counts per threshold bin
    - fn_percent_col: column with % FN per threshold bin

    Returns:
    - recommended_cutoff: numeric threshold value at knee point
    - plot figure with annotated tradeoff curve
    - augmented DataFrame with cumulative stats and knee point info
    """
    df = threshold_summary_df_filtered_fluB.copy()

    # Parse numeric threshold (handle "≤ -1000" etc.)
    def parse_threshold(x):
        if isinstance(x, str):
            x = x.replace('≤', '').strip()
        try:
            return float(x)
        except:
            return np.nan

    df['Threshold_num'] = df[threshold_col].apply(parse_threshold)
    df = df.dropna(subset=['Threshold_num']).copy()

    # Sort descending (least negative -> most negative)
    df = df.sort_values('Threshold_num', ascending=False).reset_index(drop=True)

    # Ensure numeric columns
    for col in [count_col, fn_percent_col]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

    total_samples = df[count_col].sum()
    total_fn = ((df[fn_percent_col] / 100) * df[count_col]).sum()

    # Calculate FN counts and cumulative sums (reversed for exclusion)
    df['FN_count'] = (df[fn_percent_col] / 100) * df[count_col]

    # Reverse cumulative sums (samples excluded and FN excluded if cutoff at each threshold)
    df['samples_excluded'] = df[count_col][::-1].cumsum()[::-1]
    df['fn_excluded'] = df['FN_count'][::-1].cumsum()[::-1]

    # Percents of totals
    df['% samples_excluded'] = 100 * df['samples_excluded'] / total_samples
    df['% fn_excluded'] = 100 * df['fn_excluded'] / total_fn

    # Detect knee/elbow point
    knee = KneeLocator(
        df['% samples_excluded'], df['% fn_excluded'],
        curve='concave', direction='increasing'
    )
    knee_point = knee.knee

    # Get threshold value at knee if found
    if knee_point is not None:
        recommended_cutoff = df.loc[(df['% samples_excluded'] - knee_point).abs().idxmin(), 'Threshold_num']
    else:
        recommended_cutoff = None

    # Plot tradeoff curve with knee point annotated
    plt.figure(figsize=(9,6))
    plt.plot(df['% samples_excluded'], df['% fn_excluded'], marker='o', label='FN Excluded vs Samples Excluded')
    if knee_point is not None:
        plt.axvline(knee_point, color='red', linestyle='--', label=f'Knee at {knee_point:.2f}% samples excluded')
        knee_y = df.loc[(df['% samples_excluded'] - knee_point).abs().idxmin(), '% fn_excluded']
        plt.scatter(knee_point, knee_y, color='red')
        plt.annotate(f"Threshold: {recommended_cutoff}\nFN Excluded: {knee_y:.1f}%",
                     xy=(knee_point, knee_y), xytext=(knee_point+2, knee_y-5),
                     arrowprops=dict(facecolor='black', shrink=0.05))
    plt.xlabel('% Samples Excluded (WE1 < cutoff)')
    plt.ylabel('% FN Excluded (caught)')
    plt.title('Tradeoff Curve: % FN Excluded vs % Samples Excluded')
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()

    return recommended_cutoff, df


recommended_cutoff, augmented_df = analyze_we1_cutoff_tradeoff(threshold_summary_df_filtered_fluB)
print(f"Recommended WE1 threshold cutoff: {recommended_cutoff}")
display(augmented_df.head(10))
No description has been provided for this image
Recommended WE1 threshold cutoff: -150.0
Threshold Count % Valid % Invalid % FN % WE3 FP % WE5 FP FN_total FN_invalid % FN_invalid ... % WE3_FP_invalid WE5_FP_total WE5_FP_invalid % WE5_FP_invalid Threshold_num FN_count samples_excluded fn_excluded % samples_excluded % fn_excluded
0 -50 1896 93.4 6.6 11.4 0.0 0.2 217 24 11.1 ... NaN 4 0 0.0 -50.0 216.144 3771 630.609 100.000000 100.000000
1 -75 913 91.7 8.3 12.3 0.0 0.0 112 20 17.9 ... NaN 0 0 NaN -75.0 112.299 1875 414.465 49.721559 65.724561
2 -100 432 90.7 9.3 10.0 0.0 0.0 43 16 37.2 ... NaN 0 0 NaN -100.0 43.200 962 302.166 25.510475 47.916538
3 -125 182 89.6 10.4 13.7 0.0 0.0 25 13 52.0 ... NaN 0 0 NaN -125.0 24.934 530 258.966 14.054627 41.066017
4 -150 74 79.7 20.3 25.7 0.0 0.0 19 9 47.4 ... NaN 0 0 NaN -150.0 19.018 348 234.032 9.228321 37.112062
5 -175 42 83.3 16.7 31.0 0.0 0.0 13 7 53.8 ... NaN 0 0 NaN -175.0 13.020 274 215.014 7.265977 34.096247
6 -200 26 73.1 26.9 42.3 0.0 0.0 11 7 63.6 ... NaN 0 0 NaN -200.0 10.998 232 201.994 6.152214 32.031576
7 -225 20 65.0 35.0 45.0 0.0 0.0 9 7 77.8 ... NaN 0 0 NaN -225.0 9.000 206 190.996 5.462742 30.287547
8 -250 13 46.2 53.8 69.2 0.0 0.0 9 7 77.8 ... NaN 0 0 NaN -250.0 8.996 186 181.996 4.932379 28.860356
9 -275 7 28.6 71.4 100.0 0.0 0.0 7 5 71.4 ... NaN 0 0 NaN -275.0 7.000 173 173.000 4.587643 27.433798

10 rows × 22 columns

In [88]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# --- CONFIG: use the actual raw dataframe and correct WE1 column name ---
df_raw = df_fluB_filtered.copy()   # your raw fluB dataframe
value_col = 'we1_step66_tailmean'  # correct column used elsewhere in our convo
fn_col = 'FN'                      # binary 0/1 column indicating False Negative

# --- define bin edges you used before (adjust if you want different cut points) ---
bin_edges = [-np.inf, -325, -300, -275, -250, -225, -200, -175, -150, -125, -100, -75, -50, np.inf]
# Make readable labels (we'll display the left-inclusive upper edge representation)
bin_labels = ['<-325', '-325 to -301', '-300 to -276', '-275 to -251', '-250 to -226',
              '-225 to -201', '-200 to -176', '-175 to -151', '-150 to -126', '-125 to -101',
              '-100 to -76', '-75 to -51', '-50 to 0_or_more']

# --- Quick diagnostics: existence & nulls in the WE1 column ---
print("Raw summary:")
print(f"Total rows in df_raw: {len(df_raw)}")
print(f"Column '{value_col}' present? {value_col in df_raw.columns}")
print(f"Non-null values in '{value_col}': {df_raw[value_col].notna().sum()}")
print(f"FN total from raw data (sum of `{fn_col}`): {df_raw[fn_col].sum()}\n")

# --- BINNING (defensive) ---
df_raw['_tmp_bin'] = pd.cut(df_raw[value_col], bins=bin_edges, labels=bin_labels, right=True)

# Show how many rows ended up in bins vs how many were dropped (NaN)
bin_counts_with_nan = df_raw['_tmp_bin'].value_counts(dropna=False).to_frame(name='count')
print("Counts per bin (including NaN):")
print(bin_counts_with_nan)
print("\nNumber of rows with NaN bin (not placed into any bin):", df_raw['_tmp_bin'].isna().sum())

# --- AGGREGATE BY BIN (only non-NaN bins) ---
df_summary = (
    df_raw.dropna(subset=['_tmp_bin'])
    .groupby('_tmp_bin', observed=True)
    .agg(
        Count=('FN', 'size'),
        FN_count=(fn_col, 'sum')
    )
    .reset_index()
)

# Reindex to ensure all bins present in order
df_summary = df_summary.set_index('_tmp_bin').reindex(bin_labels).reset_index()

# --- Compute percentages & cumulative ---
df_summary['% FN'] = 100.0 * df_summary['FN_count'] / df_summary['Count']
df_summary['% FN'] = df_summary['% FN'].fillna(0)  # avoid NaN when Count=0
df_summary['Count'] = df_summary['Count'].fillna(0).astype(int)
df_summary['FN_count'] = df_summary['FN_count'].fillna(0).astype(int)

# --- SORT bins from least negative to most negative ---
df_summary_sorted = df_summary.iloc[::-1].reset_index(drop=True)

# Recalculate cumulative columns in forward order
df_summary_sorted['cum_FN_count'] = df_summary_sorted['FN_count'].cumsum()
total_samples_in_bins = df_summary_sorted['Count'].sum()
df_summary_sorted['cum_%FN_total'] = 100.0 * df_summary_sorted['cum_FN_count'] / total_samples_in_bins if total_samples_in_bins else 0.0
df_summary_sorted['% Count'] = 100.0 * df_summary_sorted['Count'] / total_samples_in_bins if total_samples_in_bins else 0.0
df_summary_sorted['cum_%Samples'] = df_summary_sorted['% Count'].cumsum()

# --- Diagnostics: compare sums to raw data ---
total_fn_raw = int(df_raw[fn_col].sum())
total_fn_binned = int(df_summary['FN_count'].sum())
print(f"\nTotal FN in raw data: {total_fn_raw}")
print(f"Total FN in binned (non-NaN) data: {total_fn_binned}")
print(f"Total rows in raw: {len(df_raw)}, total rows placed in bins: {int(df_raw['_tmp_bin'].notna().sum())}")

# --- Display sorted summary table ---
display_cols = ['_tmp_bin', 'Count', '% Count', '% FN', 'FN_count', 'cum_FN_count', 'cum_%FN_total']
print("\nBinned summary (sorted least negative → most negative):")
print(df_summary_sorted[display_cols].to_string(index=False))

# --- Plot cumulative FN and samples excluded ---
fig, ax1 = plt.subplots(figsize=(11,6))

color_fn = 'tab:blue'
color_sample = 'tab:gray'

ax1.set_xlabel('WE1 Tail Mean Threshold Bins (least negative → most negative)')
ax1.set_ylabel('Cumulative % FN (of samples)', color=color_fn)
ax1.plot(df_summary_sorted['_tmp_bin'], df_summary_sorted['cum_%FN_total'], marker='o', color=color_fn, label='Cumulative % FN')
ax1.tick_params(axis='y', labelcolor=color_fn)
ax1.set_xticklabels(df_summary_sorted['_tmp_bin'], rotation=45, ha='right')

ax2 = ax1.twinx()
ax2.set_ylabel('Cumulative % Samples (excluded)', color=color_sample)
ax2.plot(df_summary_sorted['_tmp_bin'], df_summary_sorted['cum_%Samples'], marker='x', linestyle='--', color=color_sample, label='Cumulative % Samples')
ax2.tick_params(axis='y', labelcolor=color_sample)

fig.suptitle('Cumulative % FN vs % Samples Excluded by WE1 Tail Mean Threshold Bins')
fig.tight_layout()
plt.show()

# --- Optional: print examples for any NaN bins ---
nan_mask = df_raw['_tmp_bin'].isna()
if nan_mask.any():
    print("\nExamples (first 10) of rows where bin is NaN (shows WE1 value):")
    print(df_raw.loc[nan_mask, [value_col, fn_col]].head(10).to_string(index=False))
else:
    print("\nAll rows were placed into bins successfully (no NaNs).")

# --- Clean up temporary column ---
df_raw.drop(columns=['_tmp_bin'], inplace=True)
Raw summary:
Total rows in df_raw: 3436
Column 'we1_step66_tailmean' present? True
Non-null values in 'we1_step66_tailmean': 3436
FN total from raw data (sum of `FN`): 570

Counts per bin (including NaN):
                  count
_tmp_bin               
-50 to 0_or_more   2214
-75 to -51          616
-100 to -76         316
-125 to -101        176
-150 to -126         72
-175 to -151         20
-200 to -176          9
-275 to -251          4
<-325                 3
-250 to -226          3
-225 to -201          3
-325 to -301          0
-300 to -276          0

Number of rows with NaN bin (not placed into any bin): 0

Total FN in raw data: 570
Total FN in binned (non-NaN) data: 570
Total rows in raw: 3436, total rows placed in bins: 3436

Binned summary (sorted least negative → most negative):
        _tmp_bin  Count   % Count       % FN  FN_count  cum_FN_count  cum_%FN_total
-50 to 0_or_more   2214 64.435390  18.654020       413           413      12.019790
      -75 to -51    616 17.927823  12.337662        76           489      14.231665
     -100 to -76    316  9.196740  17.088608        54           543      15.803260
    -125 to -101    176  5.122235   7.954545        14           557      16.210710
    -150 to -126     72  2.095460   5.555556         4           561      16.327125
    -175 to -151     20  0.582072  15.000000         3           564      16.414435
    -200 to -176      9  0.261932  11.111111         1           565      16.443539
    -225 to -201      3  0.087311  33.333333         1           566      16.472643
    -250 to -226      3  0.087311   0.000000         0           566      16.472643
    -275 to -251      4  0.116414  25.000000         1           567      16.501746
    -300 to -276      0  0.000000   0.000000         0           567      16.501746
    -325 to -301      0  0.000000   0.000000         0           567      16.501746
           <-325      3  0.087311 100.000000         3           570      16.589057
No description has been provided for this image
All rows were placed into bins successfully (no NaNs).
In [89]:
print(df_fluB_filtered.shape)
print(df_fluB_filtered['FN'].value_counts())
(3436, 172)
FN
0    2866
1     570
Name: count, dtype: int64

CoV WE1 precheck Analysis¶

In [90]:
df_cov.shape
Out[90]:
(9958, 170)
In [91]:
# Standardizing 'Date Tested' format

df_cov['Date Tested'] = pd.to_datetime(df_cov['Date Tested'], format='mixed')

# Standardizing "Reader SN" strings

df_cov['Reader SN'] = df_cov['Reader SN'].astype(str)
df_cov['Reader SN'] = df_cov['Reader SN'].str.strip()
df_cov['Reader SN'] = df_cov['Reader SN'].str.replace(r'\.0$', '', regex=True)

# Keeping only 745 MUX (666 Flu+CoV PID, modified RV flow check), FABC-1.0.0, and FABC-005-13Aug25

df_cov = df_cov[df_cov['Procedure'].isin(['745 MUX', 'FABC-1.0.0', 'FABC-005-13Aug25'])]

df_cov.shape
Out[91]:
(5335, 170)
In [92]:
# Visualizing 'we1_step66_tailmean values as a function of WE5 values

plt.figure(figsize=(10, 6))
plt.scatter(df_cov['WE5'], df_cov['we1_step66_tailmean'], alpha=0.5)
plt.xlabel('WE5')
plt.ylabel('we1_step66_tailmean')
plt.title('Scatter Plot of we1_step66_tailmean vs. WE2')
# horizontal line at -1000
plt.axhline(y=-1000, color='r', linestyle='--')
# vertical line at -3000
plt.axvline(x=-3000, color='g', linestyle='--')
plt.grid(True)
plt.show()
No description has been provided for this image
In [93]:
# drop duplicated

df_cov_filtered = df_cov.drop_duplicates()

df_cov_filtered.shape
Out[93]:
(3139, 170)
In [94]:
# Building df_analysis_cov forlater analysis work

# Use a clean copy of the data
df_analysis_cov = df_cov.copy()  # or df_cov_filtered, depending on the dataset

# Ensure the column exists and drop NA only for the analysis
df_analysis_cov = df_analysis_cov[df_analysis_cov['we1_step66_tailmean'].notna()]
In [95]:
# Binning the WE1 precheck data

col = "we1_step66_tailmean"
data = df_cov_filtered[col].dropna()

# Define your bins and labels
bins = [float("-inf"), -1000, -500, -300, -150, -70, 0, float("inf")]
labels = [
    "< -1000",
    "-1000 to -501",
    "-500 to -301",
    "-300 to -151",
    "-150 to -71",
    "-70 to 0",
    "> 0"
]

# Apply binning
binned = pd.cut(data, bins=bins, labels=labels, include_lowest=True)
summary = binned.value_counts().sort_index().reset_index()
summary.columns = ["bin_range", "count"]

# Add percentages
summary["percent"] = (summary["count"] / summary["count"].sum() * 100).round(2)

print(summary)
       bin_range  count  percent
0        < -1000      5     0.16
1  -1000 to -501      3     0.10
2   -500 to -301      1     0.03
3   -300 to -151     55     1.75
4    -150 to -71    642    20.45
5       -70 to 0   2433    77.51
6            > 0      0     0.00
In [96]:
# Visualizing the bins

plt.figure(figsize=(8,4))
plt.bar(summary["bin_range"], summary["percent"])
plt.title(f"Distribution of {col}")
plt.xlabel("Value Range")
plt.ylabel("Percent of total")
plt.xticks(rotation=30)
# Annotating percentage above each bar
for i, v in enumerate(summary["percent"]):
    plt.text(i, v, f"{v}%", ha="center", va="bottom")
plt.show()
No description has been provided for this image
In [97]:
# Looking at WE1, WE2, WE3, WE4, WE5, and WE6 values of my individual bins

# Add the binned data as a new column to df_cov_filtered
df_cov_filtered['we1_bins'] = binned

# Group by the created bins and describe the WE columns
we_analysis = df_cov_filtered.groupby('we1_bins')[['WE1', 'WE2', 'WE3', 'WE4', 'WE5', 'WE6']].describe()
display(we_analysis)
WE1 WE2 ... WE5 WE6
count mean std min 25% 50% 75% max count mean ... 75% max count mean std min 25% 50% 75% max
we1_bins
< -1000 5.0 -13856.200000 11665.241069 -26737.0 -19783.0 -19783.0 -2549.0 -429.0 5.0 -889.000000 ... -2151.0 -166.0 5.0 -13160.800000 3773.664956 -15273.0 -15273.0 -15014.0 -13737.00 -6507.0
-1000 to -501 3.0 -401.000000 133.367912 -555.0 -439.5 -324.0 -324.0 -324.0 3.0 -192.000000 ... -85.0 0.0 3.0 -9414.666667 8026.900793 -14049.0 -14049.0 -14049.0 -7097.50 -146.0
-500 to -301 1.0 -20173.000000 NaN -20173.0 -20173.0 -20173.0 -20173.0 -20173.0 1.0 -3273.000000 ... -6967.0 -6967.0 1.0 -16275.000000 NaN -16275.0 -16275.0 -16275.0 -16275.00 -16275.0
-300 to -151 55.0 -12435.218182 6177.176826 -21424.0 -16680.5 -13825.0 -10872.0 -169.0 55.0 -112.890909 ... -17854.0 -195.0 55.0 -15132.036364 3761.278649 -22335.0 -18017.0 -16288.0 -12764.00 -5555.0
-150 to -71 642.0 -14642.644860 5632.281266 -32361.0 -17775.0 -14526.0 -11470.5 -234.0 642.0 -282.137072 ... -14678.0 -66.0 642.0 -16293.744548 3675.340207 -26205.0 -18660.0 -16641.0 -14193.25 -645.0
-70 to 0 2433.0 -15922.009864 6908.200078 -33556.0 -20796.0 -16202.0 -11952.0 0.0 2433.0 -208.009042 ... -15891.0 0.0 2433.0 -15554.863543 3637.992614 -25182.0 -17922.0 -15697.0 -13476.00 0.0
> 0 0.0 NaN NaN NaN NaN NaN NaN NaN 0.0 NaN ... NaN NaN 0.0 NaN NaN NaN NaN NaN NaN NaN

7 rows × 48 columns

In [98]:
import matplotlib.pyplot as plt
import numpy as np

we_means = df_cov_filtered.groupby('we1_bins')[['WE1', 'WE2', 'WE3', 'WE4', 'WE5', 'WE6']].mean()
we_means = we_means.reindex(labels)  # ensure correct bin order

bins = we_means.index
we_cols = we_means.columns
x = np.arange(len(bins))  # the label locations
width = 0.13  # width of each bar

plt.figure(figsize=(10,5))
for i, col in enumerate(we_cols):
    plt.bar(x + i*width, we_means[col], width=width, label=col)

plt.xticks(x + width*2.5, bins, rotation=30)
plt.ylabel("Mean WE# Value")
plt.xlabel("WE1 Bins")
plt.title("Mean WE# Values per Bin")
plt.legend(title="WE#")
plt.legend(bbox_to_anchor=(1.11, 1), loc='upper right')
plt.hlines(y=-3000, xmin=x.min(), xmax=x.max(), linestyles='dashed')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [99]:
import seaborn as sns

plt.figure(figsize=(8,6))
sns.heatmap(we_means, annot=True, fmt=".1f", cmap="coolwarm", cbar_kws={'label': 'Mean Value'})
plt.title("Mean WE# Values by we1_bin")
plt.ylabel("WE1 Bins")
plt.xlabel("WE#")
plt.xticks(rotation=0)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [100]:
import pandas as pd

# 1️⃣ Compute mean WE values per bin
we_means = df_cov_filtered.groupby('we1_bins')[['WE1', 'WE2', 'WE3', 'WE4', 'WE5', 'WE6']].mean()

# Ensure bins are in the correct order
we_means = we_means.reindex(labels)

# 2️⃣ Count samples per bin
bin_counts = df_cov_filtered['we1_bins'].value_counts().reindex(labels)
bin_counts.name = "count"

# 3️⃣ Identify the control bin
control_bin = "-70 to 0"
control_values = we_means.loc[control_bin]

# 4️⃣ Compute Delta_WE# relative to control
delta_cols = {col: f"Delta_{col}" for col in we_means.columns}
delta_values = we_means - control_values
delta_values.rename(columns=delta_cols, inplace=True)

# 5️⃣ Combine counts, means, and deltas into one summary DataFrame
summary_df = pd.concat([bin_counts, we_means, delta_values], axis=1).reset_index()
summary_df.rename(columns={"we1_bins": "bin_range"}, inplace=True)

# 6️⃣ Format numeric columns to 1 decimal place
we_cols = ['WE1','WE2','WE3','WE4','WE5','WE6']
delta_cols = [f"Delta_{col}" for col in we_cols]
summary_df[we_cols + delta_cols] = summary_df[we_cols + delta_cols].round(1)

# 7️⃣ Display the table
display(summary_df)
bin_range count WE1 WE2 WE3 WE4 WE5 WE6 Delta_WE1 Delta_WE2 Delta_WE3 Delta_WE4 Delta_WE5 Delta_WE6
0 < -1000 5 -13856.2 -889.0 -625.8 -1763.0 -12578.2 -13160.8 2065.8 -681.0 -401.0 -1482.6 7946.3 2394.1
1 -1000 to -501 3 -401.0 -192.0 -86.0 -100.0 -113.3 -9414.7 15521.0 16.0 138.8 180.4 20411.2 6140.2
2 -500 to -301 1 -20173.0 -3273.0 -566.0 -283.0 -6967.0 -16275.0 -4251.0 -3065.0 -341.2 -2.6 13557.5 -720.1
3 -300 to -151 55 -12435.2 -112.9 -198.1 -268.5 -18049.9 -15132.0 3486.8 95.1 26.7 11.9 2474.6 422.8
4 -150 to -71 642 -14642.6 -282.1 -239.4 -294.5 -19618.2 -16293.7 1279.4 -74.1 -14.6 -14.0 906.3 -738.9
5 -70 to 0 2433 -15922.0 -208.0 -224.8 -280.4 -20524.5 -15554.9 0.0 0.0 0.0 0.0 0.0 0.0
6 > 0 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
In [101]:
# Plot the Delta WE's

delta_only = summary_df.set_index('bin_range')[delta_cols]

plt.figure(figsize=(8,6))
sns.heatmap(delta_only, annot=True, fmt=".1f", cmap="coolwarm", cbar_kws={'label': 'Delta Value'})
plt.title("Delta WE# Values Relative to Control Bin (-70 to 0)")
plt.ylabel("WE1 Bins")
plt.xlabel("WE#")
plt.xticks(rotation=0)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [102]:
import pandas as pd
import numpy as np

# --- 1️⃣ Compute mean WE values per bin ---
we_means = df_cov_filtered.groupby('we1_bins')[['WE1', 'WE2', 'WE3', 'WE4', 'WE5', 'WE6']].mean()
we_means = we_means.reindex(labels)

# --- 2️⃣ Count samples per bin ---
bin_counts = df_cov_filtered['we1_bins'].value_counts().reindex(labels)
bin_counts.name = "count"

# --- 3️⃣ Identify control bin ---
control_bin = "-70 to 0"
control_values = we_means.loc[control_bin]

# --- 4️⃣ Compute ΔWE relative to control ---
delta_cols = {col: f"Delta_{col}" for col in we_means.columns}
delta_values = we_means - control_values
delta_values.rename(columns=delta_cols, inplace=True)

# --- 5️⃣ Cartridge-level status determination ---
def classify_status(row):
    """Return Valid/Invalid based on warnings/errors presence."""
    has_issue = False
    for col in ['Warnings', 'Errors']:
        if col in row and pd.notna(row[col]) and str(row[col]).strip():
            has_issue = True
            break
    return "Invalid" if has_issue else "Valid"

df_cov_filtered['status'] = df_cov_filtered.apply(classify_status, axis=1)

# --- 6️⃣ Define helper flags (FN, WE2_FP, WE3_FP) ---
# Assuming these flags are boolean or identifiable by name in columns:
for flag in ['FN', 'WE2_FP', 'WE3_FP']:
    if flag not in df_cov_filtered.columns:
        df_cov_filtered[flag] = False  # ensure column exists

# --- 7️⃣ Compute percentages per bin ---
def compute_percentages(group):
    n = len(group)
    valid = (group['status'] == 'Valid').sum()
    invalid = (group['status'] == 'Invalid').sum()
    fn = group['FN'].sum()
    we2_fp = group['WE2_FP'].sum()
    we3_fp = group['WE3_FP'].sum()
    return pd.Series({
        '% Valid': 100 * valid / n if n else np.nan,
        '% Invalid': 100 * invalid / n if n else np.nan,
        '% FN': 100 * fn / n if n else np.nan,
        '% WE2 FP': 100 * we2_fp / n if n else np.nan,
        '% WE3 FP': 100 * we3_fp / n if n else np.nan
    })

percentages = df_cov_filtered.groupby('we1_bins').apply(compute_percentages).reindex(labels)

# --- 8️⃣ Combine all results into one summary table ---
summary_df = pd.concat([bin_counts, we_means, delta_values, percentages], axis=1).reset_index()
summary_df.rename(columns={"we1_bins": "bin_range"}, inplace=True)

# --- 9️⃣ Format numeric columns to 1 decimal place ---
we_cols = ['WE1','WE2','WE3','WE4','WE5','WE6']
delta_cols = [f"Delta_{col}" for col in we_cols]
summary_df[we_cols + delta_cols] = summary_df[we_cols + delta_cols].round(1)
summary_df[['% Valid', '% Invalid', '% FN', '% WE2 FP', '% WE3 FP']] = \
    summary_df[['% Valid', '% Invalid', '% FN', '% WE2 FP', '% WE3 FP']].round(1)

# --- 🔟 Display the final summary ---
display(summary_df)
bin_range count WE1 WE2 WE3 WE4 WE5 WE6 Delta_WE1 Delta_WE2 Delta_WE3 Delta_WE4 Delta_WE5 Delta_WE6 % Valid % Invalid % FN % WE2 FP % WE3 FP
0 < -1000 5 -13856.2 -889.0 -625.8 -1763.0 -12578.2 -13160.8 2065.8 -681.0 -401.0 -1482.6 7946.3 2394.1 80.0 20.0 40.0 0.0 0.0
1 -1000 to -501 3 -401.0 -192.0 -86.0 -100.0 -113.3 -9414.7 15521.0 16.0 138.8 180.4 20411.2 6140.2 0.0 100.0 100.0 0.0 0.0
2 -500 to -301 1 -20173.0 -3273.0 -566.0 -283.0 -6967.0 -16275.0 -4251.0 -3065.0 -341.2 -2.6 13557.5 -720.1 100.0 0.0 0.0 100.0 0.0
3 -300 to -151 55 -12435.2 -112.9 -198.1 -268.5 -18049.9 -15132.0 3486.8 95.1 26.7 11.9 2474.6 422.8 100.0 0.0 18.2 0.0 0.0
4 -150 to -71 642 -14642.6 -282.1 -239.4 -294.5 -19618.2 -16293.7 1279.4 -74.1 -14.6 -14.0 906.3 -738.9 95.0 5.0 3.6 2.2 0.0
5 -70 to 0 2433 -15922.0 -208.0 -224.8 -280.4 -20524.5 -15554.9 0.0 0.0 0.0 0.0 0.0 0.0 94.1 5.9 8.7 1.1 0.1
6 > 0 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
In [103]:
# --- 11️⃣ Deep dive: failure modes among Invalid samples ---

def compute_invalid_overlap(group):
    invalid_group = group[group['status'] == 'Invalid']
    n_invalid = len(invalid_group)
    if n_invalid == 0:
        return pd.Series({
            'Invalid_with_FN': 0,
            'Invalid_with_WE2_FP': 0,
            'Invalid_with_WE3_FP': 0,
            '% Invalid_with_FN': np.nan,
            '% Invalid_with_WE2_FP': np.nan,
            '% Invalid_with_WE3_FP': np.nan
        })
    return pd.Series({
        'Invalid_with_FN': invalid_group['FN'].sum(),
        'Invalid_with_WE2_FP': invalid_group['WE2_FP'].sum(),
        'Invalid_with_WE3_FP': invalid_group['WE3_FP'].sum(),
        '% Invalid_with_FN': 100 * invalid_group['FN'].sum() / n_invalid,
        '% Invalid_with_WE2_FP': 100 * invalid_group['WE2_FP'].sum() / n_invalid,
        '% Invalid_with_WE3_FP': 100 * invalid_group['WE3_FP'].sum() / n_invalid
    })

invalid_overlap = (
    df_cov_filtered.groupby('we1_bins')
    .apply(compute_invalid_overlap)
    .reindex(labels)
)

# --- 12️⃣ Combine with main summary ---
summary_df_detailed = pd.concat([summary_df.set_index('bin_range'), invalid_overlap], axis=1).reset_index()

# --- 13️⃣ Round numeric values ---
cols_to_round = [
    '% Invalid_with_FN', '% Invalid_with_WE2_FP', '% Invalid_with_WE3_FP'
]
summary_df_detailed[cols_to_round] = summary_df_detailed[cols_to_round].round(1)



#--------Filtering to Desired Columns
#--------Columns to keep-------------
summary_df_detailed_filtered = summary_df_detailed[['index',
 'count',
 '% Valid',
 '% Invalid',
 '% FN',
 '% WE2 FP',
 '% WE3 FP',
 'Invalid_with_FN',
 'Invalid_with_WE2_FP',
 'Invalid_with_WE3_FP',
 '% Invalid_with_FN',
 '% Invalid_with_WE2_FP',
 '% Invalid_with_WE3_FP'
]]


# --- 14️⃣ Display the enhanced table ---
display(summary_df_detailed_filtered)
index count % Valid % Invalid % FN % WE2 FP % WE3 FP Invalid_with_FN Invalid_with_WE2_FP Invalid_with_WE3_FP % Invalid_with_FN % Invalid_with_WE2_FP % Invalid_with_WE3_FP
0 < -1000 5 80.0 20.0 40.0 0.0 0.0 1.0 0.0 0.0 100.0 0.0 0.0
1 -1000 to -501 3 0.0 100.0 100.0 0.0 0.0 3.0 0.0 0.0 100.0 0.0 0.0
2 -500 to -301 1 100.0 0.0 0.0 100.0 0.0 0.0 0.0 0.0 NaN NaN NaN
3 -300 to -151 55 100.0 0.0 18.2 0.0 0.0 0.0 0.0 0.0 NaN NaN NaN
4 -150 to -71 642 95.0 5.0 3.6 2.2 0.0 1.0 0.0 0.0 3.1 0.0 0.0
5 -70 to 0 2433 94.1 5.9 8.7 1.1 0.1 42.0 1.0 0.0 29.2 0.7 0.0
6 > 0 0 NaN NaN NaN NaN NaN 0.0 0.0 0.0 NaN NaN NaN
In [104]:
# --- Deep dive: failure flags that are Invalid ---

def compute_flag_invalid_overlap(group):
    fn_total = group['FN'].sum()
    WE2_total = group['WE2_FP'].sum()
    WE3_total = group['WE3_FP'].sum()

    fn_invalid = group.loc[group['FN'] & (group['status'] == 'Invalid')].shape[0]
    WE2_invalid = group.loc[group['WE2_FP'] & (group['status'] == 'Invalid')].shape[0]
    WE3_invalid = group.loc[group['WE3_FP'] & (group['status'] == 'Invalid')].shape[0]

    return pd.Series({
        'FN_total': fn_total,
        'FN_invalid': fn_invalid,
        '% FN_invalid': 100 * fn_invalid / fn_total if fn_total else np.nan,
        'WE2_FP_total': WE2_total,
        'WE2_FP_invalid': WE2_invalid,
        '% WE2_FP_invalid': 100 * WE2_invalid / WE2_total if WE2_total else np.nan,
        'WE3_FP_total': WE3_total,
        'WE3_FP_invalid': WE3_invalid,
        '% WE3_FP_invalid': 100 * WE3_invalid / WE3_total if WE3_total else np.nan,
    })

flag_invalid_overlap = (
    df_cov_filtered.groupby('we1_bins')
    .apply(compute_flag_invalid_overlap)
    .reindex(labels)
)

# --- Combine into your summary ---
summary_df_detailed = pd.concat([summary_df.set_index('bin_range'), flag_invalid_overlap], axis=1).reset_index()

# --- Round percentage columns ---
pct_cols = ['% FN_invalid', '% WE2_FP_invalid', '% WE3_FP_invalid']
summary_df_detailed[pct_cols] = summary_df_detailed[pct_cols].round(1)


#--------Filtering to Desired Columns
summary_df_detailed_filtered = summary_df_detailed[['index',
 'count',
 '% Valid',
 '% Invalid',
 '% FN',
 '% WE2 FP',
 '% WE3 FP',
'% FN_invalid', '% WE2_FP_invalid', '% WE3_FP_invalid'
]]

display(summary_df_detailed_filtered)
index count % Valid % Invalid % FN % WE2 FP % WE3 FP % FN_invalid % WE2_FP_invalid % WE3_FP_invalid
0 < -1000 5 80.0 20.0 40.0 0.0 0.0 50.0 NaN NaN
1 -1000 to -501 3 0.0 100.0 100.0 0.0 0.0 100.0 NaN NaN
2 -500 to -301 1 100.0 0.0 0.0 100.0 0.0 NaN 0.0 NaN
3 -300 to -151 55 100.0 0.0 18.2 0.0 0.0 0.0 NaN NaN
4 -150 to -71 642 95.0 5.0 3.6 2.2 0.0 4.3 0.0 NaN
5 -70 to 0 2433 94.1 5.9 8.7 1.1 0.1 19.8 3.8 0.0
6 > 0 0 NaN NaN NaN NaN NaN NaN NaN NaN
In [105]:
import pandas as pd
import numpy as np

# -------------------------------
# 1️⃣ Define continuous thresholds
# -------------------------------
thresholds = list(range(-50, -1025, -25))  # -50, -325, ..., -1000
thresholds.append(-999999)  # catch ≤ -1000
threshold_labels = [str(t) if t != -999999 else "≤ -1000" for t in thresholds]

# -------------------------------
# 2️⃣ Ensure status and FP/FN columns exist
# -------------------------------
def classify_status(row):
    """Return Valid/Invalid based on Warnings/Errors presence."""
    for col in ['Warnings', 'Errors']:
        if col in row and pd.notna(row[col]) and str(row[col]).strip():
            return "Invalid"
    return "Valid"

df_analysis_cov['status'] = df_analysis_cov.apply(classify_status, axis=1)

for flag in ['FN', 'WE2_FP', 'WE3_FP']:
    if flag not in df_analysis_cov.columns:
        df_analysis_cov[flag] = False

# -------------------------------
# 3️⃣ Loop through thresholds
# -------------------------------
results = []

for i, t in enumerate(thresholds):
    label = threshold_labels[i]

    # Subset for this threshold
    if t == -999999:
        subset = df_analysis_cov[df_analysis_cov['we1_step66_tailmean'] <= -1000]
    else:
        subset = df_analysis_cov[df_analysis_cov['we1_step66_tailmean'] <= t]

    if subset.empty:
        continue

    n = len(subset)

    # Valid / Invalid counts
    valid = (subset['status'] == 'Valid').sum()
    invalid = (subset['status'] == 'Invalid').sum()

    # FN / FP totals
    fn_total = subset['FN'].sum()
    we3_total = subset['WE2_FP'].sum()
    we5_total = subset['WE3_FP'].sum()

    # FN / FP also invalid
    fn_invalid = subset.loc[subset['FN'] & (subset['status'] == 'Invalid')].shape[0]
    we3_invalid = subset.loc[subset['WE2_FP'] & (subset['status'] == 'Invalid')].shape[0]
    we5_invalid = subset.loc[subset['WE3_FP'] & (subset['status'] == 'Invalid')].shape[0]

    # Append results
    results.append({
        'Threshold': label,
        'Count': n,
        '% Valid': 100 * valid / n,
        '% Invalid': 100 * invalid / n,
        '% FN': 100 * fn_total / n,
        '% WE2 FP': 100 * we2_total / n,
        '% WE3 FP': 100 * we3_total / n,
        'FN_total': fn_total,
        'FN_invalid': fn_invalid,
        '% FN_invalid': 100 * fn_invalid / fn_total if fn_total else np.nan,
        'WE2_FP_total': we2_total,
        'WE2_FP_invalid': we3_invalid,
        '% WE2_FP_invalid': 100 * we2_invalid / we2_total if we2_total else np.nan,
        'WE3_FP_total': we3_total,
        'WE3_FP_invalid': we3_invalid,
        '% WE3_FP_invalid': 100 * we3_invalid / we3_total if we3_total else np.nan
    })

# -------------------------------
# 4️⃣ Convert to DataFrame
# -------------------------------
threshold_summary_df = pd.DataFrame(results)

# Round percentages
pct_cols = [col for col in threshold_summary_df.columns if col.startswith('%')]
threshold_summary_df[pct_cols] = threshold_summary_df[pct_cols].round(1)

# -------------------------------
# 5️⃣ Optional: filter columns
# -------------------------------
threshold_summary_df_filtered_cov = threshold_summary_df[[
    'Threshold', 'Count', '% Valid', '% Invalid', '% FN', '% WE2 FP', '% WE3 FP',
    'FN_total', 'FN_invalid', '% FN_invalid',
    'WE2_FP_total', 'WE2_FP_invalid', '% WE2_FP_invalid',
    'WE3_FP_total', 'WE3_FP_invalid', '% WE3_FP_invalid'
]]

# -------------------------------
# 6️⃣ Display
# -------------------------------
display(threshold_summary_df_filtered_cov)
Threshold Count % Valid % Invalid % FN % WE2 FP % WE3 FP FN_total FN_invalid % FN_invalid WE2_FP_total WE2_FP_invalid % WE2_FP_invalid WE3_FP_total WE3_FP_invalid % WE3_FP_invalid
0 -50 1990 94.9 5.1 4.6 0.0 2.5 91 12 13.2 0 2 NaN 50 2 4.0
1 -75 938 94.5 5.5 4.9 0.0 2.8 46 8 17.4 0 0 NaN 26 0 0.0
2 -100 460 93.5 6.5 6.1 0.0 2.6 28 8 28.6 0 0 NaN 12 0 0.0
3 -125 186 95.7 4.3 11.8 0.0 2.2 22 8 36.4 0 0 NaN 4 0 0.0
4 -150 92 93.5 6.5 19.6 0.0 2.2 18 6 33.3 0 0 NaN 2 0 0.0
5 -175 42 85.7 14.3 33.3 0.0 4.8 14 6 42.9 0 0 NaN 2 0 0.0
6 -200 26 76.9 23.1 46.2 0.0 7.7 12 6 50.0 0 0 NaN 2 0 0.0
7 -225 18 66.7 33.3 44.4 0.0 11.1 8 6 75.0 0 0 NaN 2 0 0.0
8 -250 14 57.1 42.9 57.1 0.0 14.3 8 6 75.0 0 0 NaN 2 0 0.0
9 -275 14 57.1 42.9 57.1 0.0 14.3 8 6 75.0 0 0 NaN 2 0 0.0
10 -300 14 57.1 42.9 57.1 0.0 14.3 8 6 75.0 0 0 NaN 2 0 0.0
11 -325 14 57.1 42.9 57.1 0.0 14.3 8 6 75.0 0 0 NaN 2 0 0.0
12 -350 14 57.1 42.9 57.1 0.0 14.3 8 6 75.0 0 0 NaN 2 0 0.0
13 -375 12 50.0 50.0 66.7 0.0 0.0 8 6 75.0 0 0 NaN 0 0 NaN
14 -400 12 50.0 50.0 66.7 0.0 0.0 8 6 75.0 0 0 NaN 0 0 NaN
15 -425 12 50.0 50.0 66.7 0.0 0.0 8 6 75.0 0 0 NaN 0 0 NaN
16 -450 12 50.0 50.0 66.7 0.0 0.0 8 6 75.0 0 0 NaN 0 0 NaN
17 -475 12 50.0 50.0 66.7 0.0 0.0 8 6 75.0 0 0 NaN 0 0 NaN
18 -500 12 50.0 50.0 66.7 0.0 0.0 8 6 75.0 0 0 NaN 0 0 NaN
19 -525 10 60.0 40.0 60.0 0.0 0.0 6 4 66.7 0 0 NaN 0 0 NaN
20 -550 10 60.0 40.0 60.0 0.0 0.0 6 4 66.7 0 0 NaN 0 0 NaN
21 -575 10 60.0 40.0 60.0 0.0 0.0 6 4 66.7 0 0 NaN 0 0 NaN
22 -600 10 60.0 40.0 60.0 0.0 0.0 6 4 66.7 0 0 NaN 0 0 NaN
23 -625 10 60.0 40.0 60.0 0.0 0.0 6 4 66.7 0 0 NaN 0 0 NaN
24 -650 10 60.0 40.0 60.0 0.0 0.0 6 4 66.7 0 0 NaN 0 0 NaN
25 -675 10 60.0 40.0 60.0 0.0 0.0 6 4 66.7 0 0 NaN 0 0 NaN
26 -700 8 75.0 25.0 50.0 0.0 0.0 4 2 50.0 0 0 NaN 0 0 NaN
27 -725 8 75.0 25.0 50.0 0.0 0.0 4 2 50.0 0 0 NaN 0 0 NaN
28 -750 8 75.0 25.0 50.0 0.0 0.0 4 2 50.0 0 0 NaN 0 0 NaN
29 -775 8 75.0 25.0 50.0 0.0 0.0 4 2 50.0 0 0 NaN 0 0 NaN
30 -800 8 75.0 25.0 50.0 0.0 0.0 4 2 50.0 0 0 NaN 0 0 NaN
31 -825 8 75.0 25.0 50.0 0.0 0.0 4 2 50.0 0 0 NaN 0 0 NaN
32 -850 8 75.0 25.0 50.0 0.0 0.0 4 2 50.0 0 0 NaN 0 0 NaN
33 -875 8 75.0 25.0 50.0 0.0 0.0 4 2 50.0 0 0 NaN 0 0 NaN
34 -900 8 75.0 25.0 50.0 0.0 0.0 4 2 50.0 0 0 NaN 0 0 NaN
35 -925 8 75.0 25.0 50.0 0.0 0.0 4 2 50.0 0 0 NaN 0 0 NaN
36 -950 8 75.0 25.0 50.0 0.0 0.0 4 2 50.0 0 0 NaN 0 0 NaN
37 -975 8 75.0 25.0 50.0 0.0 0.0 4 2 50.0 0 0 NaN 0 0 NaN
38 -1000 8 75.0 25.0 50.0 0.0 0.0 4 2 50.0 0 0 NaN 0 0 NaN
39 ≤ -1000 8 75.0 25.0 50.0 0.0 0.0 4 2 50.0 0 0 NaN 0 0 NaN
In [106]:
import matplotlib.pyplot as plt

# Create the figure and main axis
fig, ax1 = plt.subplots(figsize=(10,6))

# --- Primary y-axis: Percentages ---
ax1.plot(threshold_summary_df_filtered_cov['Threshold'], threshold_summary_df_filtered_cov['% Invalid'], marker='o', label='% Invalid')
ax1.plot(threshold_summary_df_filtered_cov['Threshold'], threshold_summary_df_filtered_cov['% FN'], marker='o', label='% FN')
ax1.plot(threshold_summary_df_filtered_cov['Threshold'], threshold_summary_df_filtered_cov['% WE2 FP'], marker='o', label='% WE2 FP')
ax1.plot(threshold_summary_df_filtered_cov['Threshold'], threshold_summary_df_filtered_cov['% WE3 FP'], marker='o', label='% WE3 FP')

ax1.set_xlabel('WE1 Tail Mean Threshold')
ax1.set_ylabel('Percentage (%)', color='black')
ax1.tick_params(axis='y', labelcolor='black')
ax1.set_xticklabels(threshold_summary_df_filtered_cov['Threshold'], rotation=45)
ax1.grid(True)

# --- Secondary y-axis: Count overlay ---
ax2 = ax1.twinx()
ax2.plot(threshold_summary_df_filtered_cov['Threshold'], threshold_summary_df_filtered_cov['Count'],
         color='gray', linestyle='--', marker='x', label='Count')
ax2.set_ylabel('Sample Count', color='gray')
ax2.tick_params(axis='y', labelcolor='gray')

# --- Combine legends ---
lines, labels = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines + lines2, labels + labels2, loc='upper left')

# Title and layout
plt.title('Quality Metrics vs WE1 Tail Mean Threshold')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [106]:
 
In [107]:
# Running Average plot of Invalids and failure Modes

# --- Copy original ---
df = threshold_summary_df_filtered_cov.copy()

# --- Parse numeric threshold values (handle labels like "≤ -1000") ---
def parse_threshold(x):
    if isinstance(x, str):
        x = x.replace('≤', '').strip()
    try:
        return float(x)
    except:
        return np.nan

df['Threshold_num'] = df['Threshold'].apply(parse_threshold)

# --- Sort thresholds numerically (least → most negative) ---
df = df.sort_values(by='Threshold_num', ascending=False)

# --- Convert Count to % of total ---
df['% Count'] = 100 * df['Count'] / df['Count'].sum()

# --- Compute cumulative sums ---
df['cum_Count'] = df['Count'].cumsum()
df['cum_Invalid'] = (df['% Invalid'] / 100 * df['Count']).cumsum()
df['cum_FN'] = (df['% FN'] / 100 * df['Count']).cumsum()
df['cum_WE2_FP'] = (df['% WE2 FP'] / 100 * df['Count']).cumsum()
df['cum_WE3_FP'] = (df['% WE3 FP'] / 100 * df['Count']).cumsum()

# --- Convert cumulative sums to percentages of total Count ---
total_count = df['Count'].sum()
df['cum_% Invalid'] = 100 * df['cum_Invalid'] / total_count
df['cum_% FN'] = 100 * df['cum_FN'] / total_count
df['cum_% WE2 FP'] = 100 * df['cum_WE2_FP'] / total_count
df['cum_% WE3 FP'] = 100 * df['cum_WE3_FP'] / total_count

# --- Plot ---
fig, ax1 = plt.subplots(figsize=(10,6))

# --- Primary axis: cumulative metrics ---
ax1.plot(df['Threshold_num'], df['cum_% Invalid'], marker='o', label='Cumulative % Invalid')
ax1.plot(df['Threshold_num'], df['cum_% FN'], marker='o', label='Cumulative % FN')
ax1.plot(df['Threshold_num'], df['cum_% WE2 FP'], marker='o', label='Cumulative % WE2 FP')
ax1.plot(df['Threshold_num'], df['cum_% WE3 FP'], marker='o', label='Cumulative % WE3 FP')

ax1.set_xlabel('WE1 Tail Mean Threshold')
ax1.set_ylabel('Cumulative Percentage of Total Samples (%)', color='black')
ax1.tick_params(axis='y', labelcolor='black')
ax1.set_xticks(df['Threshold_num'])
ax1.set_xticklabels(df['Threshold'], rotation=45)
ax1.grid(True)

# --- Reverse x-axis (least negative → most negative) ---
ax1.invert_xaxis()

# --- Secondary axis: % Count per threshold ---
ax2 = ax1.twinx()
ax2.plot(df['Threshold_num'], df['% Count'], color='gray', linestyle='--', marker='x', label='% of Total Count (per bin)')
ax2.set_ylabel('Sample % of Total (bin-level)', color='gray')
ax2.tick_params(axis='y', labelcolor='gray')

# --- Combine legends ---
lines, labels = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines + lines2, labels + labels2, loc='upper left')

plt.title('Cumulative Quality Metrics vs WE1 Tail Mean Threshold')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [108]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Copy source
df = threshold_summary_df_filtered_cov.copy()

# Parse numeric thresholds (handle "≤ -1000" etc.)
def parse_threshold(x):
    if isinstance(x, str):
        x = x.replace('≤', '').strip()
    try:
        return float(x)
    except:
        return np.nan

df['Threshold_num'] = df['Threshold'].apply(parse_threshold)

# --- Ensure we only use rows with valid numeric threshold ---
df = df.dropna(subset=['Threshold_num']).copy()

# Sort in the natural accumulation order: least negative -> most negative
df = df.sort_values('Threshold_num', ascending=False).reset_index(drop=True)

# Ensure numeric columns
for col in ['Count', '% FN', '% Invalid', '% WE2 FP', '% WE3 FP']:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

# Per-bin absolute counts of events (from percent and count)
df['FN_count']     = (df['% FN'] / 100.0) * df['Count']
df['Invalid_count']= (df['% Invalid'] / 100.0) * df['Count']
df['WE2FP_count']  = (df['% WE2 FP'] / 100.0) * df['Count']
df['WE3FP_count']  = (df['% WE3 FP'] / 100.0) * df['Count']

# Running-average style (not usually desired for accumulation)
df['running_avg_%FN'] = df['% FN'].cumsum() / np.arange(1, len(df) + 1)

# True cumulative totals (absolute) and convert to percent of total samples
total_samples = df['Count'].sum() if df['Count'].sum() else 1
df['cum_FN_count'] = df['FN_count'].cumsum()
df['cum_%FN_total'] = 100.0 * df['cum_FN_count'] / total_samples

# Also cum for Invalids / FPs
df['cum_Invalid_count'] = df['Invalid_count'].cumsum()
df['cum_%Invalid_total'] = 100.0 * df['cum_Invalid_count'] / total_samples
df['cum_WE2FP_count'] = df['WE2FP_count'].cumsum()
df['cum_%WE2FP_total'] = 100.0 * df['cum_WE2FP_count'] / total_samples
df['cum_WE3FP_count'] = df['WE3FP_count'].cumsum()
df['cum_%WE3FP_total'] = 100.0 * df['cum_WE3FP_count'] / total_samples

# Diagnostic: check monotonicity (should be non-decreasing)
def is_monotonic_nondec(series):
    return (np.diff(series) >= -1e-8).all()

monotonic_fn = is_monotonic_nondec(df['cum_%FN_total'].values)
if not monotonic_fn:
    print("WARNING: cum_%FN_total is not monotonic non-decreasing. Inspect 'FN_count' and ordering.")

# Add % Total Samples column (aka % Count)
df['% Count'] = 100.0 * df['Count'] / total_samples

# Add Highlight column for rows where % Count ~ 5% (±0.5%)
tolerance = 0.5
df['Highlight_5pct'] = df['% Count'].apply(lambda x: '<< 5% mark' if abs(x - 5.0) <= tolerance else '')

# --- Plot both versions for comparison ---
fig, ax = plt.subplots(figsize=(11,6))

# True cumulative proportions (preferred)
ax.plot(df['Threshold_num'], df['cum_%Invalid_total'], marker='o', label='True cum % Invalid (of total samples)')
ax.plot(df['Threshold_num'], df['cum_%FN_total'], marker='o', label='True cum % FN (of total samples)')
ax.plot(df['Threshold_num'], df['cum_%WE2FP_total'], marker='o', label='True cum % WE2 FP (of total samples)')
ax.plot(df['Threshold_num'], df['cum_%WE3FP_total'], marker='o', label='True cum % WE3 FP (of total samples)')

# Running-average lines (for reference)
ax.plot(df['Threshold_num'], df['running_avg_%FN'], marker='x', linestyle='--', label='Running avg %FN (not cumulative)')

# X axis labels
ax.set_xticks(df['Threshold_num'])
ax.set_xticklabels(df['Threshold'], rotation=45)
ax.invert_xaxis()  # show least negative -> most negative left->right
ax.set_xlabel('WE1 Tail Mean Threshold')
ax.set_ylabel('Percentage (%)')
ax.grid(True)
ax.legend(loc='upper left')

# Right axis: percent of samples per threshold (bin-level)
ax2 = ax.twinx()
ax2.plot(df['Threshold_num'], df['% Count'], color='gray', linestyle='--', marker='x', label='% of total (per threshold)')
ax2.set_ylabel('% of total samples (per threshold)', color='gray')
ax2.tick_params(axis='y', labelcolor='gray')

# Show plot
plt.title('True cumulative vs running-average %FN (and other metrics)')
plt.tight_layout()
plt.show()

# Optional: print a small table for debugging with new columns
display_cols = ['Threshold', 'Count', '% Count', '% FN', 'FN_count', 'cum_FN_count', 'cum_%FN_total', 'Highlight_5pct']
print(df[display_cols].head(12).to_string(index=False))
No description has been provided for this image
Threshold  Count   % Count  % FN  FN_count  cum_FN_count  cum_%FN_total Highlight_5pct
      -50   1990 48.822375   4.6    91.540        91.540       2.245829               
      -75    938 23.012758   4.9    45.962       137.502       3.373454               
     -100    460 11.285574   6.1    28.060       165.562       4.061874               
     -125    186  4.563297  11.8    21.948       187.510       4.600343     << 5% mark
     -150     92  2.257115  19.6    18.032       205.542       5.042738               
     -175     42  1.030422  33.3    13.986       219.528       5.385868               
     -200     26  0.637880  46.2    12.012       231.540       5.680569               
     -225     18  0.441609  44.4     7.992       239.532       5.876644               
     -250     14  0.343474  57.1     7.994       247.526       6.072767               
     -275     14  0.343474  57.1     7.994       255.520       6.268891               
     -300     14  0.343474  57.1     7.994       263.514       6.465015               
     -325     14  0.343474  57.1     7.994       271.508       6.661138               
In [109]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from kneed import KneeLocator  # You'll need to install this package via pip

def analyze_we1_cutoff_tradeoff(threshold_df, threshold_col='Threshold', count_col='Count', fn_percent_col='% FN'):
    """
    Analyze tradeoff between sample exclusion and FN exclusion by WE1 threshold.
    Detect knee point to recommend cutoff.

    Parameters:
    - threshold_df: pd.DataFrame with threshold summary
    - threshold_col: column name with threshold labels
    - count_col: column with sample counts per threshold bin
    - fn_percent_col: column with % FN per threshold bin

    Returns:
    - recommended_cutoff: numeric threshold value at knee point
    - plot figure with annotated tradeoff curve
    - augmented DataFrame with cumulative stats and knee point info
    """
    df = threshold_summary_df_filtered_cov.copy()

    # Parse numeric threshold (handle "≤ -1000" etc.)
    def parse_threshold(x):
        if isinstance(x, str):
            x = x.replace('≤', '').strip()
        try:
            return float(x)
        except:
            return np.nan

    df['Threshold_num'] = df[threshold_col].apply(parse_threshold)
    df = df.dropna(subset=['Threshold_num']).copy()

    # Sort descending (least negative -> most negative)
    df = df.sort_values('Threshold_num', ascending=False).reset_index(drop=True)

    # Ensure numeric columns
    for col in [count_col, fn_percent_col]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

    total_samples = df[count_col].sum()
    total_fn = ((df[fn_percent_col] / 100) * df[count_col]).sum()

    # Calculate FN counts and cumulative sums (reversed for exclusion)
    df['FN_count'] = (df[fn_percent_col] / 100) * df[count_col]

    # Reverse cumulative sums (samples excluded and FN excluded if cutoff at each threshold)
    df['samples_excluded'] = df[count_col][::-1].cumsum()[::-1]
    df['fn_excluded'] = df['FN_count'][::-1].cumsum()[::-1]

    # Percents of totals
    df['% samples_excluded'] = 100 * df['samples_excluded'] / total_samples
    df['% fn_excluded'] = 100 * df['fn_excluded'] / total_fn

    # Detect knee/elbow point
    knee = KneeLocator(
        df['% samples_excluded'], df['% fn_excluded'],
        curve='concave', direction='increasing'
    )
    knee_point = knee.knee

    # Get threshold value at knee if found
    if knee_point is not None:
        recommended_cutoff = df.loc[(df['% samples_excluded'] - knee_point).abs().idxmin(), 'Threshold_num']
    else:
        recommended_cutoff = None

    # Plot tradeoff curve with knee point annotated
    plt.figure(figsize=(9,6))
    plt.plot(df['% samples_excluded'], df['% fn_excluded'], marker='o', label='FN Excluded vs Samples Excluded')
    if knee_point is not None:
        plt.axvline(knee_point, color='red', linestyle='--', label=f'Knee at {knee_point:.2f}% samples excluded')
        knee_y = df.loc[(df['% samples_excluded'] - knee_point).abs().idxmin(), '% fn_excluded']
        plt.scatter(knee_point, knee_y, color='red')
        plt.annotate(f"Threshold: {recommended_cutoff}\nFN Excluded: {knee_y:.1f}%",
                     xy=(knee_point, knee_y), xytext=(knee_point+2, knee_y-5),
                     arrowprops=dict(facecolor='black', shrink=0.05))
    plt.xlabel('% Samples Excluded (WE1 < cutoff)')
    plt.ylabel('% FN Excluded (caught)')
    plt.title('Tradeoff Curve: % FN Excluded vs % Samples Excluded')
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()

    return recommended_cutoff, df


recommended_cutoff, augmented_df = analyze_we1_cutoff_tradeoff(threshold_summary_df_filtered_cov)
print(f"Recommended WE1 threshold cutoff: {recommended_cutoff}")
display(augmented_df.head(10))
No description has been provided for this image
Recommended WE1 threshold cutoff: -125.0
Threshold Count % Valid % Invalid % FN % WE2 FP % WE3 FP FN_total FN_invalid % FN_invalid ... % WE2_FP_invalid WE3_FP_total WE3_FP_invalid % WE3_FP_invalid Threshold_num FN_count samples_excluded fn_excluded % samples_excluded % fn_excluded
0 -50 1990 94.9 5.1 4.6 0.0 2.5 91 12 13.2 ... NaN 50 2 4.0 -50.0 91.540 4076 425.526 100.000000 100.000000
1 -75 938 94.5 5.5 4.9 0.0 2.8 46 8 17.4 ... NaN 26 0 0.0 -75.0 45.962 2086 333.986 51.177625 78.487801
2 -100 460 93.5 6.5 6.1 0.0 2.6 28 8 28.6 ... NaN 12 0 0.0 -100.0 28.060 1148 288.024 28.164868 67.686581
3 -125 186 95.7 4.3 11.8 0.0 2.2 22 8 36.4 ... NaN 4 0 0.0 -125.0 21.948 688 259.964 16.879293 61.092389
4 -150 92 93.5 6.5 19.6 0.0 2.2 18 6 33.3 ... NaN 2 0 0.0 -150.0 18.032 502 238.016 12.315996 55.934537
5 -175 42 85.7 14.3 33.3 0.0 4.8 14 6 42.9 ... NaN 2 0 0.0 -175.0 13.986 410 219.984 10.058881 51.696959
6 -200 26 76.9 23.1 46.2 0.0 7.7 12 6 50.0 ... NaN 2 0 0.0 -200.0 12.012 368 205.998 9.028459 48.410203
7 -225 18 66.7 33.3 44.4 0.0 11.1 8 6 75.0 ... NaN 2 0 0.0 -225.0 7.992 342 193.986 8.390579 45.587344
8 -250 14 57.1 42.9 57.1 0.0 14.3 8 6 75.0 ... NaN 2 0 0.0 -250.0 7.994 324 185.994 7.948970 43.709198
9 -275 14 57.1 42.9 57.1 0.0 14.3 8 6 75.0 ... NaN 2 0 0.0 -275.0 7.994 310 178.000 7.605496 41.830581

10 rows × 22 columns

In [110]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# --- CONFIG: use the actual raw dataframe and correct WE1 column name ---
df_raw = df_cov_filtered.copy()   # your raw fluB dataframe
value_col = 'we1_step66_tailmean'  # correct column used elsewhere in our convo
fn_col = 'FN'                      # binary 0/1 column indicating False Negative

# --- define bin edges you used before (adjust if you want different cut points) ---
bin_edges = [-np.inf, -325, -300, -275, -250, -225, -200, -175, -150, -125, -100, -75, -50, np.inf]
# Make readable labels (we'll display the left-inclusive upper edge representation)
bin_labels = ['<-325', '-325 to -301', '-300 to -276', '-275 to -251', '-250 to -226',
              '-225 to -201', '-200 to -176', '-175 to -151', '-150 to -126', '-125 to -101',
              '-100 to -76', '-75 to -51', '-50 to 0_or_more']

# --- Quick diagnostics: existence & nulls in the WE1 column ---
print("Raw summary:")
print(f"Total rows in df_raw: {len(df_raw)}")
print(f"Column '{value_col}' present? {value_col in df_raw.columns}")
print(f"Non-null values in '{value_col}': {df_raw[value_col].notna().sum()}")
print(f"FN total from raw data (sum of `{fn_col}`): {df_raw[fn_col].sum()}\n")

# --- BINNING (defensive) ---
df_raw['_tmp_bin'] = pd.cut(df_raw[value_col], bins=bin_edges, labels=bin_labels, right=True)

# Show how many rows ended up in bins vs how many were dropped (NaN)
bin_counts_with_nan = df_raw['_tmp_bin'].value_counts(dropna=False).to_frame(name='count')
print("Counts per bin (including NaN):")
print(bin_counts_with_nan)
print("\nNumber of rows with NaN bin (not placed into any bin):", df_raw['_tmp_bin'].isna().sum())

# --- AGGREGATE BY BIN (only non-NaN bins) ---
df_summary = (
    df_raw.dropna(subset=['_tmp_bin'])
    .groupby('_tmp_bin', observed=True)
    .agg(
        Count=('FN', 'size'),
        FN_count=(fn_col, 'sum')
    )
    .reset_index()
)

# Reindex to ensure all bins present in order
df_summary = df_summary.set_index('_tmp_bin').reindex(bin_labels).reset_index()

# --- Compute percentages & cumulative ---
df_summary['% FN'] = 100.0 * df_summary['FN_count'] / df_summary['Count']
df_summary['% FN'] = df_summary['% FN'].fillna(0)  # avoid NaN when Count=0
df_summary['Count'] = df_summary['Count'].fillna(0).astype(int)
df_summary['FN_count'] = df_summary['FN_count'].fillna(0).astype(int)

# --- SORT bins from least negative to most negative ---
df_summary_sorted = df_summary.iloc[::-1].reset_index(drop=True)

# Recalculate cumulative columns in forward order
df_summary_sorted['cum_FN_count'] = df_summary_sorted['FN_count'].cumsum()
total_samples_in_bins = df_summary_sorted['Count'].sum()
df_summary_sorted['cum_%FN_total'] = 100.0 * df_summary_sorted['cum_FN_count'] / total_samples_in_bins if total_samples_in_bins else 0.0
df_summary_sorted['% Count'] = 100.0 * df_summary_sorted['Count'] / total_samples_in_bins if total_samples_in_bins else 0.0
df_summary_sorted['cum_%Samples'] = df_summary_sorted['% Count'].cumsum()

# --- Diagnostics: compare sums to raw data ---
total_fn_raw = int(df_raw[fn_col].sum())
total_fn_binned = int(df_summary['FN_count'].sum())
print(f"\nTotal FN in raw data: {total_fn_raw}")
print(f"Total FN in binned (non-NaN) data: {total_fn_binned}")
print(f"Total rows in raw: {len(df_raw)}, total rows placed in bins: {int(df_raw['_tmp_bin'].notna().sum())}")

# --- Display sorted summary table ---
display_cols = ['_tmp_bin', 'Count', '% Count', '% FN', 'FN_count', 'cum_FN_count', 'cum_%FN_total']
print("\nBinned summary (sorted least negative → most negative):")
print(df_summary_sorted[display_cols].to_string(index=False))

# --- Plot cumulative FN and samples excluded ---
fig, ax1 = plt.subplots(figsize=(11,6))

color_fn = 'tab:blue'
color_sample = 'tab:gray'

ax1.set_xlabel('WE1 Tail Mean Threshold Bins (least negative → most negative)')
ax1.set_ylabel('Cumulative % FN (of samples)', color=color_fn)
ax1.plot(df_summary_sorted['_tmp_bin'], df_summary_sorted['cum_%FN_total'], marker='o', color=color_fn, label='Cumulative % FN')
ax1.tick_params(axis='y', labelcolor=color_fn)
ax1.set_xticklabels(df_summary_sorted['_tmp_bin'], rotation=45, ha='right')

ax2 = ax1.twinx()
ax2.set_ylabel('Cumulative % Samples (excluded)', color=color_sample)
ax2.plot(df_summary_sorted['_tmp_bin'], df_summary_sorted['cum_%Samples'], marker='x', linestyle='--', color=color_sample, label='Cumulative % Samples')
ax2.tick_params(axis='y', labelcolor=color_sample)

fig.suptitle('Cumulative % FN vs % Samples Excluded by WE1 Tail Mean Threshold Bins')
fig.tight_layout()
plt.show()

# --- Optional: print examples for any NaN bins ---
nan_mask = df_raw['_tmp_bin'].isna()
if nan_mask.any():
    print("\nExamples (first 10) of rows where bin is NaN (shows WE1 value):")
    print(df_raw.loc[nan_mask, [value_col, fn_col]].head(10).to_string(index=False))
else:
    print("\nAll rows were placed into bins successfully (no NaNs).")

# --- Clean up temporary column ---
df_raw.drop(columns=['_tmp_bin'], inplace=True)
Raw summary:
Total rows in df_raw: 3139
Column 'we1_step66_tailmean' present? True
Non-null values in 'we1_step66_tailmean': 3139
FN total from raw data (sum of `FN`): 250

Counts per bin (including NaN):
                  count
_tmp_bin               
-50 to 0_or_more   1945
-75 to -51          592
-100 to -76         280
-125 to -101        193
-150 to -126         65
-175 to -151         34
-200 to -176         10
<-325                 9
-225 to -201          7
-250 to -226          4
-275 to -251          0
-325 to -301          0
-300 to -276          0

Number of rows with NaN bin (not placed into any bin): 0

Total FN in raw data: 250
Total FN in binned (non-NaN) data: 250
Total rows in raw: 3139, total rows placed in bins: 3139

Binned summary (sorted least negative → most negative):
        _tmp_bin  Count   % Count      % FN  FN_count  cum_FN_count  cum_%FN_total
-50 to 0_or_more   1945 61.962408  9.768638       190           190       6.052883
      -75 to -51    592 18.859509  4.391892        26           216       6.881172
     -100 to -76    280  8.920038  4.285714        12           228       7.263460
    -125 to -101    193  6.148455  2.590674         5           233       7.422746
    -150 to -126     65  2.070723  3.076923         2           235       7.486461
    -175 to -151     34  1.083147 11.764706         4           239       7.613890
    -200 to -176     10  0.318573 20.000000         2           241       7.677604
    -225 to -201      7  0.223001 57.142857         4           245       7.805033
    -250 to -226      4  0.127429  0.000000         0           245       7.805033
    -275 to -251      0  0.000000  0.000000         0           245       7.805033
    -300 to -276      0  0.000000  0.000000         0           245       7.805033
    -325 to -301      0  0.000000  0.000000         0           245       7.805033
           <-325      9  0.286716 55.555556         5           250       7.964320
No description has been provided for this image
All rows were placed into bins successfully (no NaNs).
In [111]:
print(df_cov_filtered.shape)
print(df_cov_filtered['FN'].value_counts())
(3139, 172)
FN
0    2889
1     250
Name: count, dtype: int64
In [112]:
# Extracting cartridges from -70 to 0 bin in df_cov where FN = 1

cartridges_with_fn = df_cov[(df_cov['we1_step66_tailmean'] > -70) & (df_cov['FN'] ==1)]
cartridges_with_fn.head()
Out[112]:
Study Label Pellet Variable A Variable B Variable C Variable D Procedure Test Sample RP Type RP Cell Count ... we6_step93_max we1_step66_tailmean we2_step66_tailmean we3_step66_tailmean we4_step66_tailmean we5_step66_tailmean we6_step66_tailmean FN WE3_FP WE2_FP
66 FABC-0350 FABC-Tetra-20251003-MT-HF-T6K-A Control RV202 NaN NaN NaN FABC-1.0.0 CoV SCM NaN ... 0.0 -31.714828 0.0 0.0 0.0 0.0 0.0 1 0 0
149 FABC-0350 FABC-Tetra-20251003-MT-HF-T6K-A RV202 + 0.5% Triton X100 NaN NaN NaN FABC-1.0.0 CoV SCM NaN ... 0.0 -15.382642 0.0 0.0 0.0 0.0 0.0 1 0 0
153 FABC-0350 FABC-Tetra-20251003-MT-HF-T6K-A RV202 + 0.5% Triton X100 NaN NaN NaN FABC-1.0.0 CoV SCM NaN ... 0.0 -24.504611 0.0 0.0 0.0 0.0 0.0 1 0 0
154 FABC-0350 FABC-Tetra-20251003-MT-HF-T6K-A RV202 + 0.5% Triton X100 NaN NaN NaN FABC-1.0.0 CoV SCM NaN ... 0.0 -16.569574 0.0 0.0 0.0 0.0 0.0 1 0 0
155 FABC-0350 FABC-Tetra-20251003-MT-HF-T6K-A RV202 + 0.5% Triton X100 NaN NaN NaN FABC-1.0.0 CoV SCM NaN ... 0.0 -16.332187 0.0 0.0 0.0 0.0 0.0 1 0 0

5 rows × 170 columns

In [113]:
# Count of Cov Cartridges with FN's

cartridges_with_fn['cartridgeSerialNumber'].nunique()
Out[113]:
175
In [114]:
# Count of Cov Cartridges with FN's that are also invalid

# Check for non-null and non-empty string values in 'Warnings'
warnings_present = cartridges_with_fn['Warnings'].notna() & (cartridges_with_fn['Warnings'].astype(str).str.strip().astype(bool))

# Count unique cartridges where warnings are present
count_with_warnings = cartridges_with_fn[warnings_present]['cartridgeSerialNumber'].nunique()

print(f'Count of unique cartridges with FN that also have warnings: {count_with_warnings}')
Count of unique cartridges with FN that also have warnings: 34

NTC Analysis¶

In [115]:
# Building df_analysis_ntc forlater analysis_ntc work

# Use a clean copy of the data
df_analysis_ntc = df_ntc.copy()  # or df_cov_filtered, depending on the dataset

# Ensure the column exists and drop NA only for the analysis_ntc
df_analysis_ntc = df_analysis_ntc[df_analysis_ntc['we1_step66_tailmean'].notna()]
In [116]:
import pandas as pd
import numpy as np

# -------------------------------
# 1️⃣ Define continuous thresholds
# -------------------------------
thresholds = list(range(-50, -1025, -25))  # -50, -325, ..., -1000
thresholds.append(-999999)  # catch ≤ -1000
threshold_labels = [str(t) if t != -999999 else "≤ -1000" for t in thresholds]

# -------------------------------
# 2️⃣ Ensure status and FP/FN columns exist
# -------------------------------
def classify_status(row):
    """Return Valid/Invalid based on Warnings/Errors presence."""
    for col in ['Warnings', 'Errors']:
        if col in row and pd.notna(row[col]) and str(row[col]).strip():
            return "Invalid"
    return "Valid"

df_analysis_ntc['status'] = df_analysis_ntc.apply(classify_status, axis=1)

for flag in ['WE2_FP', 'WE3_FP', 'WE5_FP']:
    if flag not in df_analysis_ntc.columns:
        df_analysis_ntc[flag] = False

# -------------------------------
# 3️⃣ Loop through thresholds
# -------------------------------
results = []

for i, t in enumerate(thresholds):
    label = threshold_labels[i]

    # Subset for this threshold
    if t == -999999:
        subset = df_analysis_ntc[df_analysis_ntc['we1_step66_tailmean'] <= -1000]
    else:
        subset = df_analysis_ntc[df_analysis_ntc['we1_step66_tailmean'] <= t]

    if subset.empty:
        continue

    n = len(subset)

    # Valid / Invalid counts
    valid = (subset['status'] == 'Valid').sum()
    invalid = (subset['status'] == 'Invalid').sum()

    # FN / FP totals
    we2_total = subset['WE2_FP'].sum()
    we3_total = subset['WE3_FP'].sum()
    we5_total = subset['WE5_FP'].sum()

    # FN / FP also invalid
    we2_invalid = subset.loc[subset['WE2_FP'] & (subset['status'] == 'Invalid')].shape[0]
    we3_invalid = subset.loc[subset['WE3_FP'] & (subset['status'] == 'Invalid')].shape[0]
    we5_invalid = subset.loc[subset['WE5_FP'] & (subset['status'] == 'Invalid')].shape[0]

    # Append results
    results.append({
        'Threshold': label,
        'Count': n,
        '% Valid': 100 * valid / n,
        '% Invalid': 100 * invalid / n,
        '% WE2 FP': 100 * we2_total / n,
        '% WE3 FP': 100 * we3_total / n,
        '% WE5 FP': 100 * we5_total / n,
        'WE2_FP_total': we2_total,
        'WE2_FP_invalid': we2_invalid,
        '% WE2_FP_invalid': 100 * we2_invalid / we2_total if we2_total else np.nan,
        'WE3_FP_total': we3_total,
        'WE3_FP_invalid': we3_invalid,
        '% WE3_FP_invalid': 100 * we3_invalid / we3_total if we3_total else np.nan,
        'WE5_FP_total': we5_total,
        'WE5_FP_invalid': we5_invalid,
        '% WE5_FP_invalid': 100 * we5_invalid / we5_total if we5_total else np.nan
    })

# -------------------------------
# 4️⃣ Convert to DataFrame
# -------------------------------
threshold_summary_df = pd.DataFrame(results)

# Round percentages
pct_cols = [col for col in threshold_summary_df.columns if col.startswith('%')]
threshold_summary_df[pct_cols] = threshold_summary_df[pct_cols].round(1)

# -------------------------------
# 5️⃣ Optional: filter columns
# -------------------------------
threshold_summary_df_filtered_ntc = threshold_summary_df[[
    'Threshold', 'Count', '% Valid', '% Invalid','% WE2 FP','% WE3 FP', '% WE5 FP',
    'WE2_FP_total', 'WE2_FP_invalid', '% WE2_FP_invalid',
    'WE3_FP_total', 'WE3_FP_invalid', '% WE3_FP_invalid',
    'WE5_FP_total', 'WE5_FP_invalid', '% WE5_FP_invalid'
]]

# -------------------------------
# 6️⃣ Display
# -------------------------------
display(threshold_summary_df_filtered_ntc)
Threshold Count % Valid % Invalid % WE2 FP % WE3 FP % WE5 FP WE2_FP_total WE2_FP_invalid % WE2_FP_invalid WE3_FP_total WE3_FP_invalid % WE3_FP_invalid WE5_FP_total WE5_FP_invalid % WE5_FP_invalid
0 -50 4012 92.6 7.4 1.3 0.5 0.4 51 16 31.4 19 8 42.1 17 7 41.2
1 -75 2032 91.4 8.6 1.5 0.5 0.6 30 11 36.7 11 6 54.5 13 5 38.5
2 -100 1113 87.6 12.4 1.8 0.9 1.0 20 10 50.0 10 6 60.0 11 5 45.5
3 -125 647 83.6 16.4 2.3 1.4 1.4 15 9 60.0 9 5 55.6 9 4 44.4
4 -150 428 77.6 22.4 3.3 1.9 1.4 14 8 57.1 8 4 50.0 6 3 50.0
5 -175 357 75.4 24.6 3.6 2.0 1.7 13 7 53.8 7 4 57.1 6 3 50.0
6 -200 300 71.0 29.0 3.7 2.3 2.0 11 7 63.6 7 4 57.1 6 3 50.0
7 -225 280 70.4 29.6 3.9 2.5 2.1 11 7 63.6 7 4 57.1 6 3 50.0
8 -250 255 68.6 31.4 4.3 2.7 2.4 11 7 63.6 7 4 57.1 6 3 50.0
9 -275 218 66.5 33.5 4.1 3.2 2.8 9 7 77.8 7 4 57.1 6 3 50.0
10 -300 208 66.8 33.2 4.3 3.4 2.9 9 7 77.8 7 4 57.1 6 3 50.0
11 -325 200 67.0 33.0 4.5 3.5 3.0 9 7 77.8 7 4 57.1 6 3 50.0
12 -350 190 66.8 33.2 4.2 3.7 3.2 8 6 75.0 7 4 57.1 6 3 50.0
13 -375 183 67.8 32.2 4.4 3.8 3.3 8 6 75.0 7 4 57.1 6 3 50.0
14 -400 175 68.0 32.0 4.6 4.0 3.4 8 6 75.0 7 4 57.1 6 3 50.0
15 -425 170 68.2 31.8 4.7 4.1 3.5 8 6 75.0 7 4 57.1 6 3 50.0
16 -450 166 68.1 31.9 4.8 4.2 3.6 8 6 75.0 7 4 57.1 6 3 50.0
17 -475 158 69.0 31.0 5.1 4.4 3.8 8 6 75.0 7 4 57.1 6 3 50.0
18 -500 154 69.5 30.5 5.2 4.5 3.9 8 6 75.0 7 4 57.1 6 3 50.0
19 -525 145 71.7 28.3 5.5 4.8 3.4 8 6 75.0 7 4 57.1 5 3 60.0
20 -550 140 71.4 28.6 5.7 5.0 3.6 8 6 75.0 7 4 57.1 5 3 60.0
21 -575 136 71.3 28.7 5.9 5.1 3.7 8 6 75.0 7 4 57.1 5 3 60.0
22 -600 132 71.2 28.8 6.1 5.3 3.8 8 6 75.0 7 4 57.1 5 3 60.0
23 -625 130 71.5 28.5 6.2 5.4 3.8 8 6 75.0 7 4 57.1 5 3 60.0
24 -650 129 71.3 28.7 6.2 5.4 3.9 8 6 75.0 7 4 57.1 5 3 60.0
25 -675 126 71.4 28.6 6.3 5.6 3.2 8 6 75.0 7 4 57.1 4 3 75.0
26 -700 123 71.5 28.5 6.5 5.7 3.3 8 6 75.0 7 4 57.1 4 3 75.0
27 -725 121 71.9 28.1 6.6 5.8 3.3 8 6 75.0 7 4 57.1 4 3 75.0
28 -750 119 73.1 26.9 6.7 5.9 3.4 8 6 75.0 7 4 57.1 4 3 75.0
29 -775 116 72.4 27.6 6.9 6.0 3.4 8 6 75.0 7 4 57.1 4 3 75.0
30 -800 115 72.2 27.8 7.0 6.1 3.5 8 6 75.0 7 4 57.1 4 3 75.0
31 -825 110 73.6 26.4 7.3 6.4 3.6 8 6 75.0 7 4 57.1 4 3 75.0
32 -850 106 74.5 25.5 7.5 6.6 3.8 8 6 75.0 7 4 57.1 4 3 75.0
33 -875 100 74.0 26.0 8.0 7.0 4.0 8 6 75.0 7 4 57.1 4 3 75.0
34 -900 99 73.7 26.3 8.1 7.1 4.0 8 6 75.0 7 4 57.1 4 3 75.0
35 -925 98 74.5 25.5 8.2 7.1 4.1 8 6 75.0 7 4 57.1 4 3 75.0
36 -950 93 73.1 26.9 8.6 7.5 4.3 8 6 75.0 7 4 57.1 4 3 75.0
37 -975 93 73.1 26.9 8.6 7.5 4.3 8 6 75.0 7 4 57.1 4 3 75.0
38 -1000 92 72.8 27.2 8.7 7.6 4.3 8 6 75.0 7 4 57.1 4 3 75.0
39 ≤ -1000 92 72.8 27.2 8.7 7.6 4.3 8 6 75.0 7 4 57.1 4 3 75.0
In [117]:
import matplotlib.pyplot as plt

# Create the figure and main axis
fig, ax1 = plt.subplots(figsize=(10,6))

# --- Primary y-axis: Percentages ---
ax1.plot(threshold_summary_df_filtered_ntc['Threshold'], threshold_summary_df_filtered_ntc['% Invalid'], marker='o', label='% Invalid')
ax1.plot(threshold_summary_df_filtered_ntc['Threshold'], threshold_summary_df_filtered_ntc['% WE2 FP'], marker='o', label='% WE2 FP')
ax1.plot(threshold_summary_df_filtered_ntc['Threshold'], threshold_summary_df_filtered_ntc['% WE3 FP'], marker='o', label='% WE3 FP')
ax1.plot(threshold_summary_df_filtered_ntc['Threshold'], threshold_summary_df_filtered_ntc['% WE5 FP'], marker='o', label='% WE5 FP')

ax1.set_xlabel('WE1 Tail Mean Threshold')
ax1.set_ylabel('Percentage (%)', color='black')
ax1.tick_params(axis='y', labelcolor='black')
ax1.set_xticklabels(threshold_summary_df_filtered_ntc['Threshold'], rotation=45)
ax1.grid(True)

# --- Secondary y-axis: Count overlay ---
ax2 = ax1.twinx()
ax2.plot(threshold_summary_df_filtered_ntc['Threshold'], threshold_summary_df_filtered_ntc['Count'],
         color='gray', linestyle='--', marker='x', label='Count')
ax2.set_ylabel('Sample Count', color='gray')
ax2.tick_params(axis='y', labelcolor='gray')

# --- Combine legends ---
lines, labels = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines + lines2, labels + labels2, loc='upper left')

# Title and layout
plt.title('Quality Metrics vs WE1 Tail Mean Threshold')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [117]:
 
In [118]:
# Running Average plot of Invalids and failure Modes

# --- Copy original ---
df = threshold_summary_df_filtered_ntc.copy()

# --- Parse numeric threshold values (handle labels like "≤ -1000") ---
def parse_threshold(x):
    if isinstance(x, str):
        x = x.replace('≤', '').strip()
    try:
        return float(x)
    except:
        return np.nan

df['Threshold_num'] = df['Threshold'].apply(parse_threshold)

# --- Sort thresholds numerically (least → most negative) ---
df = df.sort_values(by='Threshold_num', ascending=False)

# --- Convert Count to % of total ---
df['% Count'] = 100 * df['Count'] / df['Count'].sum()

# --- Compute cumulative sums ---
df['cum_Count'] = df['Count'].cumsum()
df['cum_Invalid'] = (df['% Invalid'] / 100 * df['Count']).cumsum()
df['cum_WE5_FP'] = (df['% WE5 FP'] / 100 * df['Count']).cumsum()
df['cum_WE2_FP'] = (df['% WE2 FP'] / 100 * df['Count']).cumsum()
df['cum_WE3_FP'] = (df['% WE3 FP'] / 100 * df['Count']).cumsum()

# --- Convert cumulative sums to percentages of total Count ---
total_count = df['Count'].sum()
df['cum_% Invalid'] = 100 * df['cum_Invalid'] / total_count
df['cum_% WE5 FP'] = 100 * df['cum_WE5_FP'] / total_count
df['cum_% WE2 FP'] = 100 * df['cum_WE2_FP'] / total_count
df['cum_% WE3 FP'] = 100 * df['cum_WE3_FP'] / total_count

# --- Plot ---
fig, ax1 = plt.subplots(figsize=(10,6))

# --- Primary axis: cumulative metrics ---
ax1.plot(df['Threshold_num'], df['cum_% Invalid'], marker='o', label='Cumulative % Invalid')
ax1.plot(df['Threshold_num'], df['cum_% WE5 FP'], marker='o', label='Cumulative % WE5 FP')
ax1.plot(df['Threshold_num'], df['cum_% WE2 FP'], marker='o', label='Cumulative % WE2 FP')
ax1.plot(df['Threshold_num'], df['cum_% WE3 FP'], marker='o', label='Cumulative % WE3 FP')

ax1.set_xlabel('WE1 Tail Mean Threshold')
ax1.set_ylabel('Cumulative Percentage of Total Samples (%)', color='black')
ax1.tick_params(axis='y', labelcolor='black')
ax1.set_xticks(df['Threshold_num'])
ax1.set_xticklabels(df['Threshold'], rotation=45)
ax1.grid(True)

# --- Reverse x-axis (least negative → most negative) ---
ax1.invert_xaxis()

# --- Secondary axis: % Count per threshold ---
ax2 = ax1.twinx()
ax2.plot(df['Threshold_num'], df['% Count'], color='gray', linestyle='--', marker='x', label='% of Total Count (per bin)')
ax2.set_ylabel('Sample % of Total (bin-level)', color='gray')
ax2.tick_params(axis='y', labelcolor='gray')

# --- Combine legends ---
lines, labels = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines + lines2, labels + labels2, loc='upper left')

plt.title('Cumulative Quality Metrics vs WE1 Tail Mean Threshold')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [119]:
# Cumulative Plot of Invalids and Failure Modes

# copy source
df = threshold_summary_df_filtered_cov.copy()

# parse numeric thresholds (handle "≤ -1000" etc.)
def parse_threshold(x):
    if isinstance(x, str):
        x = x.replace('≤', '').strip()
    try:
        return float(x)
    except:
        return np.nan

df['Threshold_num'] = df['Threshold'].apply(parse_threshold)

# --- Ensure we only use rows with valid numeric threshold ---
df = df.dropna(subset=['Threshold_num']).copy()

# Sort in the natural accumulation order: least negative -> most negative
df = df.sort_values('Threshold_num', ascending=False).reset_index(drop=True)

# Ensure numeric columns
for col in ['Count', '% FN', '% Invalid', '% WE2 FP', '% WE3 FP']:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

# Per-bin absolute counts of events (from percent and count)
df['FN_count']     = (df['% FN'] / 100.0) * df['Count']
df['Invalid_count']= (df['% Invalid'] / 100.0) * df['Count']
df['WE2FP_count']  = (df['% WE2 FP'] / 100.0) * df['Count']
df['WE3FP_count']  = (df['% WE3 FP'] / 100.0) * df['Count']

# --- Running-average style (not usually desired for accumulation) ---
df['running_avg_%FN'] = df['% FN'].cumsum() / np.arange(1, len(df) + 1)

# --- True cumulative totals (absolute) and convert to percent of total samples ---
total_samples = df['Count'].sum() if df['Count'].sum() else 1
df['cum_FN_count'] = df['FN_count'].cumsum()
df['cum_%FN_total'] = 100.0 * df['cum_FN_count'] / total_samples

# Also cum for Invalids / FPs
df['cum_Invalid_count'] = df['Invalid_count'].cumsum()
df['cum_%Invalid_total'] = 100.0 * df['cum_Invalid_count'] / total_samples
df['cum_WE2FP_count'] = df['WE2FP_count'].cumsum()
df['cum_%WE2FP_total'] = 100.0 * df['cum_WE2FP_count'] / total_samples
df['cum_WE3FP_count'] = df['WE3FP_count'].cumsum()
df['cum_%WE3FP_total'] = 100.0 * df['cum_WE3FP_count'] / total_samples

# Diagnostic: check monotonicity (should be non-decreasing)
def is_monotonic_nondec(series):
    return (np.diff(series) >= -1e-8).all()

monotonic_fn = is_monotonic_nondec(df['cum_%FN_total'].values)
if not monotonic_fn:
    print("WARNING: cum_%FN_total is not monotonic non-decreasing. Inspect 'FN_count' and ordering.")

# --- Plot both versions for comparison ---
fig, ax = plt.subplots(figsize=(11,6))

# True cumulative proportions (preferred)
ax.plot(df['Threshold_num'], df['cum_%Invalid_total'], marker='o', label='True cum % Invalid (of total samples)')
ax.plot(df['Threshold_num'], df['cum_%FN_total'], marker='o', label='True cum % FN (of total samples)')
ax.plot(df['Threshold_num'], df['cum_%WE2FP_total'], marker='o', label='True cum % WE2 FP (of total samples)')
ax.plot(df['Threshold_num'], df['cum_%WE3FP_total'], marker='o', label='True cum % WE3 FP (of total samples)')

# Running-average lines (for reference)
ax.plot(df['Threshold_num'], df['running_avg_%FN'], marker='x', linestyle='--', label='Running avg %FN (not cumulative)')

# X axis labels
ax.set_xticks(df['Threshold_num'])
ax.set_xticklabels(df['Threshold'], rotation=45)
ax.invert_xaxis()  # show least negative -> most negative left->right
ax.set_xlabel('WE1 Tail Mean Threshold')
ax.set_ylabel('Percentage (%)')
ax.grid(True)
ax.legend(loc='upper left')

# Right axis: percent of samples per threshold (bin-level)
ax2 = ax.twinx()
df['% Count'] = 100.0 * df['Count'] / total_samples
ax2.plot(df['Threshold_num'], df['% Count'], color='gray', linestyle='--', marker='x', label='% of total (per threshold)')
ax2.set_ylabel('% of total samples (per threshold)', color='gray')
ax2.tick_params(axis='y', labelcolor='gray')

# show
plt.title('True cumulative vs running-average %FN (and other metrics)')
plt.tight_layout()
plt.show()

# Optional: print a small table for debugging
display_cols = ['Threshold','Count','% FN','FN_count','cum_FN_count','cum_%FN_total']
print(df[display_cols].head(12).to_string(index=False))
No description has been provided for this image
Threshold  Count  % FN  FN_count  cum_FN_count  cum_%FN_total
      -50   1990   4.6    91.540        91.540       2.245829
      -75    938   4.9    45.962       137.502       3.373454
     -100    460   6.1    28.060       165.562       4.061874
     -125    186  11.8    21.948       187.510       4.600343
     -150     92  19.6    18.032       205.542       5.042738
     -175     42  33.3    13.986       219.528       5.385868
     -200     26  46.2    12.012       231.540       5.680569
     -225     18  44.4     7.992       239.532       5.876644
     -250     14  57.1     7.994       247.526       6.072767
     -275     14  57.1     7.994       255.520       6.268891
     -300     14  57.1     7.994       263.514       6.465015
     -325     14  57.1     7.994       271.508       6.661138
In [120]:
# Preparing a Merged Summary of all Threshold-Based results

def prepare_summary(df, prefix):
    """
    Prepare a threshold summary DataFrame by:
      - Ensuring 'Threshold' is included
      - Prefixing all other columns
    """
    if 'Threshold' not in df.columns:
        raise ValueError(f"'Threshold' column not found in {prefix} dataframe.")

    # Copy and rename
    df = df.copy()
    rename_map = {col: f"{prefix}_{col}" for col in df.columns if col != 'Threshold'}
    df = df.rename(columns=rename_map)
    return df

# Prepare each dataset dynamically (works even if columns differ)
ntc_df  = prepare_summary(threshold_summary_df_filtered_ntc, "NTC")
fluA_df = prepare_summary(threshold_summary_df_filtered_fluA, "FluA")
fluB_df = prepare_summary(threshold_summary_df_filtered_fluB, "FluB")
cov_df  = prepare_summary(threshold_summary_df_filtered_cov, "CoV")

# Merge all summaries on Threshold (outer join so all thresholds are retained)
merged_summary = ntc_df.merge(fluA_df, on="Threshold", how="outer")
merged_summary = merged_summary.merge(fluB_df, on="Threshold", how="outer")
merged_summary = merged_summary.merge(cov_df, on="Threshold", how="outer")

# Sort Threshold numerically (handles text like '≤ -1000')
def threshold_sort_value(x):
    try:
        return float(x)
    except:
        return -999999  # For strings like '≤ -1000'

merged_summary = merged_summary.sort_values(
    by="Threshold",
    key=lambda col: col.map(threshold_sort_value)
).reset_index(drop=True)

# Clean up column order: Threshold first
cols = ['Threshold'] + [c for c in merged_summary.columns if c != 'Threshold']
merged_summary = merged_summary[cols]

# Display final merged table
display(merged_summary)
Threshold NTC_Count NTC_% Valid NTC_% Invalid NTC_% WE2 FP NTC_% WE3 FP NTC_% WE5 FP NTC_WE2_FP_total NTC_WE2_FP_invalid NTC_% WE2_FP_invalid ... CoV_% WE3 FP CoV_FN_total CoV_FN_invalid CoV_% FN_invalid CoV_WE2_FP_total CoV_WE2_FP_invalid CoV_% WE2_FP_invalid CoV_WE3_FP_total CoV_WE3_FP_invalid CoV_% WE3_FP_invalid
0 ≤ -1000 92 72.8 27.2 8.7 7.6 4.3 8 6 75.0 ... 0.0 4 2 50.0 0 0 NaN 0 0 NaN
1 -1000 92 72.8 27.2 8.7 7.6 4.3 8 6 75.0 ... 0.0 4 2 50.0 0 0 NaN 0 0 NaN
2 -975 93 73.1 26.9 8.6 7.5 4.3 8 6 75.0 ... 0.0 4 2 50.0 0 0 NaN 0 0 NaN
3 -950 93 73.1 26.9 8.6 7.5 4.3 8 6 75.0 ... 0.0 4 2 50.0 0 0 NaN 0 0 NaN
4 -925 98 74.5 25.5 8.2 7.1 4.1 8 6 75.0 ... 0.0 4 2 50.0 0 0 NaN 0 0 NaN
5 -900 99 73.7 26.3 8.1 7.1 4.0 8 6 75.0 ... 0.0 4 2 50.0 0 0 NaN 0 0 NaN
6 -875 100 74.0 26.0 8.0 7.0 4.0 8 6 75.0 ... 0.0 4 2 50.0 0 0 NaN 0 0 NaN
7 -850 106 74.5 25.5 7.5 6.6 3.8 8 6 75.0 ... 0.0 4 2 50.0 0 0 NaN 0 0 NaN
8 -825 110 73.6 26.4 7.3 6.4 3.6 8 6 75.0 ... 0.0 4 2 50.0 0 0 NaN 0 0 NaN
9 -800 115 72.2 27.8 7.0 6.1 3.5 8 6 75.0 ... 0.0 4 2 50.0 0 0 NaN 0 0 NaN
10 -775 116 72.4 27.6 6.9 6.0 3.4 8 6 75.0 ... 0.0 4 2 50.0 0 0 NaN 0 0 NaN
11 -750 119 73.1 26.9 6.7 5.9 3.4 8 6 75.0 ... 0.0 4 2 50.0 0 0 NaN 0 0 NaN
12 -725 121 71.9 28.1 6.6 5.8 3.3 8 6 75.0 ... 0.0 4 2 50.0 0 0 NaN 0 0 NaN
13 -700 123 71.5 28.5 6.5 5.7 3.3 8 6 75.0 ... 0.0 4 2 50.0 0 0 NaN 0 0 NaN
14 -675 126 71.4 28.6 6.3 5.6 3.2 8 6 75.0 ... 0.0 6 4 66.7 0 0 NaN 0 0 NaN
15 -650 129 71.3 28.7 6.2 5.4 3.9 8 6 75.0 ... 0.0 6 4 66.7 0 0 NaN 0 0 NaN
16 -625 130 71.5 28.5 6.2 5.4 3.8 8 6 75.0 ... 0.0 6 4 66.7 0 0 NaN 0 0 NaN
17 -600 132 71.2 28.8 6.1 5.3 3.8 8 6 75.0 ... 0.0 6 4 66.7 0 0 NaN 0 0 NaN
18 -575 136 71.3 28.7 5.9 5.1 3.7 8 6 75.0 ... 0.0 6 4 66.7 0 0 NaN 0 0 NaN
19 -550 140 71.4 28.6 5.7 5.0 3.6 8 6 75.0 ... 0.0 6 4 66.7 0 0 NaN 0 0 NaN
20 -525 145 71.7 28.3 5.5 4.8 3.4 8 6 75.0 ... 0.0 6 4 66.7 0 0 NaN 0 0 NaN
21 -500 154 69.5 30.5 5.2 4.5 3.9 8 6 75.0 ... 0.0 8 6 75.0 0 0 NaN 0 0 NaN
22 -475 158 69.0 31.0 5.1 4.4 3.8 8 6 75.0 ... 0.0 8 6 75.0 0 0 NaN 0 0 NaN
23 -450 166 68.1 31.9 4.8 4.2 3.6 8 6 75.0 ... 0.0 8 6 75.0 0 0 NaN 0 0 NaN
24 -425 170 68.2 31.8 4.7 4.1 3.5 8 6 75.0 ... 0.0 8 6 75.0 0 0 NaN 0 0 NaN
25 -400 175 68.0 32.0 4.6 4.0 3.4 8 6 75.0 ... 0.0 8 6 75.0 0 0 NaN 0 0 NaN
26 -375 183 67.8 32.2 4.4 3.8 3.3 8 6 75.0 ... 0.0 8 6 75.0 0 0 NaN 0 0 NaN
27 -350 190 66.8 33.2 4.2 3.7 3.2 8 6 75.0 ... 14.3 8 6 75.0 0 0 NaN 2 0 0.0
28 -325 200 67.0 33.0 4.5 3.5 3.0 9 7 77.8 ... 14.3 8 6 75.0 0 0 NaN 2 0 0.0
29 -300 208 66.8 33.2 4.3 3.4 2.9 9 7 77.8 ... 14.3 8 6 75.0 0 0 NaN 2 0 0.0
30 -275 218 66.5 33.5 4.1 3.2 2.8 9 7 77.8 ... 14.3 8 6 75.0 0 0 NaN 2 0 0.0
31 -250 255 68.6 31.4 4.3 2.7 2.4 11 7 63.6 ... 14.3 8 6 75.0 0 0 NaN 2 0 0.0
32 -225 280 70.4 29.6 3.9 2.5 2.1 11 7 63.6 ... 11.1 8 6 75.0 0 0 NaN 2 0 0.0
33 -200 300 71.0 29.0 3.7 2.3 2.0 11 7 63.6 ... 7.7 12 6 50.0 0 0 NaN 2 0 0.0
34 -175 357 75.4 24.6 3.6 2.0 1.7 13 7 53.8 ... 4.8 14 6 42.9 0 0 NaN 2 0 0.0
35 -150 428 77.6 22.4 3.3 1.9 1.4 14 8 57.1 ... 2.2 18 6 33.3 0 0 NaN 2 0 0.0
36 -125 647 83.6 16.4 2.3 1.4 1.4 15 9 60.0 ... 2.2 22 8 36.4 0 0 NaN 4 0 0.0
37 -100 1113 87.6 12.4 1.8 0.9 1.0 20 10 50.0 ... 2.6 28 8 28.6 0 0 NaN 12 0 0.0
38 -75 2032 91.4 8.6 1.5 0.5 0.6 30 11 36.7 ... 2.8 46 8 17.4 0 0 NaN 26 0 0.0
39 -50 4012 92.6 7.4 1.3 0.5 0.4 51 16 31.4 ... 2.5 91 12 13.2 0 2 NaN 50 2 4.0

40 rows × 61 columns

In [121]:
merged_summary.to_csv('merged_summary.csv', index=False)